Revert "Placate mypy (redux)" (#188)

Reverts mediawiki-client-tools/mediawiki-dump-generator#186

This shouldn't have been merged quite yet, since it didn't pass the
tests.
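
A minimal sketch of how the failing checks could be re-run locally before re-landing #186, assuming the Poetry and pre-commit setup shown in this diff (the exact CI invocations are not part of this commit):

    poetry install                          # install runtime and dev dependencies from poetry.lock
    poetry run pre-commit run --all-files   # poetry-check, poetry-export and the other configured hooks
    poetry run pytest                       # run the test suite (pytest is a dev dependency in the lock file)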
Elsie Hupp, committed by GitHub
parent 20a1c7ae00
commit 6e5e01394a

@ -9,7 +9,7 @@ repos:
rev: 1.6.0
hooks:
- id: poetry-check
- id: poetry-lock
# - id: poetry-lock
- id: poetry-export
args: ["-f", "requirements.txt", "-o", "requirements.txt"]
- repo: https://github.com/pre-commit/pre-commit-hooks

poetry.lock (generated; 293 changes)

@ -1,9 +1,10 @@
# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
# This file is automatically @generated by Poetry and should not be changed by hand.
[[package]]
name = "atomicwrites"
version = "1.4.1"
description = "Atomic file writes."
category = "dev"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
@ -14,6 +15,7 @@ files = [
name = "attrs"
version = "23.1.0"
description = "Classes Without Boilerplate"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -28,56 +30,11 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib-
tests = ["attrs[tests-no-zope]", "zope-interface"]
tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"]
[[package]]
name = "black"
version = "23.7.0"
description = "The uncompromising code formatter."
optional = false
python-versions = ">=3.8"
files = [
{file = "black-23.7.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:5c4bc552ab52f6c1c506ccae05681fab58c3f72d59ae6e6639e8885e94fe2587"},
{file = "black-23.7.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:552513d5cd5694590d7ef6f46e1767a4df9af168d449ff767b13b084c020e63f"},
{file = "black-23.7.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:86cee259349b4448adb4ef9b204bb4467aae74a386bce85d56ba4f5dc0da27be"},
{file = "black-23.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:501387a9edcb75d7ae8a4412bb8749900386eaef258f1aefab18adddea1936bc"},
{file = "black-23.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb074d8b213749fa1d077d630db0d5f8cc3b2ae63587ad4116e8a436e9bbe995"},
{file = "black-23.7.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:b5b0ee6d96b345a8b420100b7d71ebfdd19fab5e8301aff48ec270042cd40ac2"},
{file = "black-23.7.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:893695a76b140881531062d48476ebe4a48f5d1e9388177e175d76234ca247cd"},
{file = "black-23.7.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:c333286dc3ddca6fdff74670b911cccedacb4ef0a60b34e491b8a67c833b343a"},
{file = "black-23.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831d8f54c3a8c8cf55f64d0422ee875eecac26f5f649fb6c1df65316b67c8926"},
{file = "black-23.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:7f3bf2dec7d541b4619b8ce526bda74a6b0bffc480a163fed32eb8b3c9aed8ad"},
{file = "black-23.7.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:f9062af71c59c004cd519e2fb8f5d25d39e46d3af011b41ab43b9c74e27e236f"},
{file = "black-23.7.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:01ede61aac8c154b55f35301fac3e730baf0c9cf8120f65a9cd61a81cfb4a0c3"},
{file = "black-23.7.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:327a8c2550ddc573b51e2c352adb88143464bb9d92c10416feb86b0f5aee5ff6"},
{file = "black-23.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6d1c6022b86f83b632d06f2b02774134def5d4d4f1dac8bef16d90cda18ba28a"},
{file = "black-23.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:27eb7a0c71604d5de083757fbdb245b1a4fae60e9596514c6ec497eb63f95320"},
{file = "black-23.7.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:8417dbd2f57b5701492cd46edcecc4f9208dc75529bcf76c514864e48da867d9"},
{file = "black-23.7.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:47e56d83aad53ca140da0af87678fb38e44fd6bc0af71eebab2d1f59b1acf1d3"},
{file = "black-23.7.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:25cc308838fe71f7065df53aedd20327969d05671bac95b38fdf37ebe70ac087"},
{file = "black-23.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:642496b675095d423f9b8448243336f8ec71c9d4d57ec17bf795b67f08132a91"},
{file = "black-23.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:ad0014efc7acf0bd745792bd0d8857413652979200ab924fbf239062adc12491"},
{file = "black-23.7.0-py3-none-any.whl", hash = "sha256:9fd59d418c60c0348505f2ddf9609c1e1de8e7493eab96198fc89d9f865e7a96"},
{file = "black-23.7.0.tar.gz", hash = "sha256:022a582720b0d9480ed82576c920a8c1dde97cc38ff11d8d8859b3bd6ca9eedb"},
]
[package.dependencies]
click = ">=8.0.0"
mypy-extensions = ">=0.4.3"
packaging = ">=22.0"
pathspec = ">=0.9.0"
platformdirs = ">=2"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""}
[package.extras]
colorama = ["colorama (>=0.4.3)"]
d = ["aiohttp (>=3.7.4)"]
jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"]
uvloop = ["uvloop (>=0.15.2)"]
[[package]]
name = "certifi"
version = "2023.7.22"
description = "Python package for providing Mozilla's CA Bundle."
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -89,6 +46,7 @@ files = [
name = "cfgv"
version = "3.4.0"
description = "Validate configuration and produce human readable error messages."
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
@ -100,6 +58,7 @@ files = [
name = "charset-normalizer"
version = "3.2.0"
description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
category = "main"
optional = false
python-versions = ">=3.7.0"
files = [
@ -180,24 +139,11 @@ files = [
{file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"},
]
[[package]]
name = "click"
version = "8.1.7"
description = "Composable command line interface toolkit"
optional = false
python-versions = ">=3.7"
files = [
{file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"},
{file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"},
]
[package.dependencies]
colorama = {version = "*", markers = "platform_system == \"Windows\""}
[[package]]
name = "colorama"
version = "0.4.6"
description = "Cross-platform colored terminal text."
category = "main"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
files = [
@ -209,6 +155,7 @@ files = [
name = "contextlib2"
version = "21.6.0"
description = "Backports and enhancements for the contextlib module"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -220,6 +167,7 @@ files = [
name = "distlib"
version = "0.3.7"
description = "Distribution utilities"
category = "dev"
optional = false
python-versions = "*"
files = [
@ -231,6 +179,7 @@ files = [
name = "docopt"
version = "0.6.2"
description = "Pythonic argument parser, that will make you smile"
category = "main"
optional = false
python-versions = "*"
files = [
@ -241,6 +190,7 @@ files = [
name = "file-read-backwards"
version = "2.0.0"
description = "Memory efficient way of reading files line-by-line from the end of file"
category = "main"
optional = false
python-versions = "*"
files = [
@ -250,26 +200,25 @@ files = [
[[package]]
name = "filelock"
version = "3.12.3"
version = "3.12.2"
description = "A platform independent file lock."
category = "dev"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"},
{file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"},
{file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"},
{file = "filelock-3.12.2.tar.gz", hash = "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81"},
]
[package.dependencies]
typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""}
[package.extras]
docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"]
docs = ["furo (>=2023.5.20)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"]
testing = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"]
[[package]]
name = "flake8"
version = "3.9.2"
description = "the modular source code checker: pep8 pyflakes and co"
category = "dev"
optional = false
python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7"
files = [
@ -282,34 +231,16 @@ mccabe = ">=0.6.0,<0.7.0"
pycodestyle = ">=2.7.0,<2.8.0"
pyflakes = ">=2.3.0,<2.4.0"
[[package]]
name = "flake8-black"
version = "0.3.6"
description = "flake8 plugin to call black as a code style validator"
optional = false
python-versions = ">=3.7"
files = [
{file = "flake8-black-0.3.6.tar.gz", hash = "sha256:0dfbca3274777792a5bcb2af887a4cad72c72d0e86c94e08e3a3de151bb41c34"},
{file = "flake8_black-0.3.6-py3-none-any.whl", hash = "sha256:fe8ea2eca98d8a504f22040d9117347f6b367458366952862ac3586e7d4eeaca"},
]
[package.dependencies]
black = ">=22.1.0"
flake8 = ">=3"
tomli = {version = "*", markers = "python_version < \"3.11\""}
[package.extras]
develop = ["build", "twine"]
[[package]]
name = "identify"
version = "2.5.27"
version = "2.5.26"
description = "File identification library for Python"
category = "dev"
optional = false
python-versions = ">=3.8"
files = [
{file = "identify-2.5.27-py2.py3-none-any.whl", hash = "sha256:fdb527b2dfe24602809b2201e033c2a113d7bdf716db3ca8e3243f735dcecaba"},
{file = "identify-2.5.27.tar.gz", hash = "sha256:287b75b04a0e22d727bc9a41f0d4f3c1bcada97490fa6eabb5b28f0e9097e733"},
{file = "identify-2.5.26-py2.py3-none-any.whl", hash = "sha256:c22a8ead0d4ca11f1edd6c9418c3220669b3b7533ada0a0ffa6cc0ef85cf9b54"},
{file = "identify-2.5.26.tar.gz", hash = "sha256:7243800bce2f58404ed41b7c002e53d4d22bcf3ae1b7900c2d7aefd95394bf7f"},
]
[package.extras]
@ -319,6 +250,7 @@ license = ["ukkonen"]
name = "idna"
version = "3.4"
description = "Internationalized Domain Names in Applications (IDNA)"
category = "main"
optional = false
python-versions = ">=3.5"
files = [
@ -330,6 +262,7 @@ files = [
name = "iniconfig"
version = "2.0.0"
description = "brain-dead simple config-ini parsing"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -341,6 +274,7 @@ files = [
name = "internetarchive"
version = "3.5.0"
description = "A Python interface to archive.org."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -366,6 +300,7 @@ types = ["tqdm-stubs (>=0.2.0)", "types-colorama", "types-docopt (>=0.6.10,<0.7.
name = "jsonpatch"
version = "1.33"
description = "Apply JSON-Patches (RFC 6902)"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
@ -380,6 +315,7 @@ jsonpointer = ">=1.9"
name = "jsonpointer"
version = "2.4"
description = "Identify specific nodes in a JSON document (RFC 6901)"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
@ -391,6 +327,7 @@ files = [
name = "lxml"
version = "4.9.3"
description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*"
files = [
@ -498,6 +435,7 @@ source = ["Cython (>=0.29.35)"]
name = "mccabe"
version = "0.6.1"
description = "McCabe checker, plugin for flake8"
category = "dev"
optional = false
python-versions = "*"
files = [
@ -509,6 +447,7 @@ files = [
name = "mwclient"
version = "0.10.1"
description = "MediaWiki API client"
category = "main"
optional = false
python-versions = "*"
files = [
@ -520,67 +459,11 @@ files = [
requests-oauthlib = "*"
six = "*"
[[package]]
name = "mypy"
version = "1.5.1"
description = "Optional static typing for Python"
optional = false
python-versions = ">=3.8"
files = [
{file = "mypy-1.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f33592ddf9655a4894aef22d134de7393e95fcbdc2d15c1ab65828eee5c66c70"},
{file = "mypy-1.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:258b22210a4a258ccd077426c7a181d789d1121aca6db73a83f79372f5569ae0"},
{file = "mypy-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9ec1f695f0c25986e6f7f8778e5ce61659063268836a38c951200c57479cc12"},
{file = "mypy-1.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:abed92d9c8f08643c7d831300b739562b0a6c9fcb028d211134fc9ab20ccad5d"},
{file = "mypy-1.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:a156e6390944c265eb56afa67c74c0636f10283429171018446b732f1a05af25"},
{file = "mypy-1.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6ac9c21bfe7bc9f7f1b6fae441746e6a106e48fc9de530dea29e8cd37a2c0cc4"},
{file = "mypy-1.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:51cb1323064b1099e177098cb939eab2da42fea5d818d40113957ec954fc85f4"},
{file = "mypy-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:596fae69f2bfcb7305808c75c00f81fe2829b6236eadda536f00610ac5ec2243"},
{file = "mypy-1.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:32cb59609b0534f0bd67faebb6e022fe534bdb0e2ecab4290d683d248be1b275"},
{file = "mypy-1.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:159aa9acb16086b79bbb0016145034a1a05360626046a929f84579ce1666b315"},
{file = "mypy-1.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f6b0e77db9ff4fda74de7df13f30016a0a663928d669c9f2c057048ba44f09bb"},
{file = "mypy-1.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26f71b535dfc158a71264e6dc805a9f8d2e60b67215ca0bfa26e2e1aa4d4d373"},
{file = "mypy-1.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc3a600f749b1008cc75e02b6fb3d4db8dbcca2d733030fe7a3b3502902f161"},
{file = "mypy-1.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:26fb32e4d4afa205b24bf645eddfbb36a1e17e995c5c99d6d00edb24b693406a"},
{file = "mypy-1.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:82cb6193de9bbb3844bab4c7cf80e6227d5225cc7625b068a06d005d861ad5f1"},
{file = "mypy-1.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4a465ea2ca12804d5b34bb056be3a29dc47aea5973b892d0417c6a10a40b2d65"},
{file = "mypy-1.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9fece120dbb041771a63eb95e4896791386fe287fefb2837258925b8326d6160"},
{file = "mypy-1.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d28ddc3e3dfeab553e743e532fb95b4e6afad51d4706dd22f28e1e5e664828d2"},
{file = "mypy-1.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:57b10c56016adce71fba6bc6e9fd45d8083f74361f629390c556738565af8eeb"},
{file = "mypy-1.5.1-cp38-cp38-win_amd64.whl", hash = "sha256:ff0cedc84184115202475bbb46dd99f8dcb87fe24d5d0ddfc0fe6b8575c88d2f"},
{file = "mypy-1.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8f772942d372c8cbac575be99f9cc9d9fb3bd95c8bc2de6c01411e2c84ebca8a"},
{file = "mypy-1.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5d627124700b92b6bbaa99f27cbe615c8ea7b3402960f6372ea7d65faf376c14"},
{file = "mypy-1.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:361da43c4f5a96173220eb53340ace68cda81845cd88218f8862dfb0adc8cddb"},
{file = "mypy-1.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:330857f9507c24de5c5724235e66858f8364a0693894342485e543f5b07c8693"},
{file = "mypy-1.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:c543214ffdd422623e9fedd0869166c2f16affe4ba37463975043ef7d2ea8770"},
{file = "mypy-1.5.1-py3-none-any.whl", hash = "sha256:f757063a83970d67c444f6e01d9550a7402322af3557ce7630d3c957386fa8f5"},
{file = "mypy-1.5.1.tar.gz", hash = "sha256:b031b9601f1060bf1281feab89697324726ba0c0bae9d7cd7ab4b690940f0b92"},
]
[package.dependencies]
mypy-extensions = ">=1.0.0"
tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""}
typing-extensions = ">=4.1.0"
[package.extras]
dmypy = ["psutil (>=4.0)"]
install-types = ["pip"]
reports = ["lxml"]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
description = "Type system extensions for programs checked with the mypy type checker."
optional = false
python-versions = ">=3.5"
files = [
{file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"},
{file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"},
]
[[package]]
name = "nodeenv"
version = "1.8.0"
description = "Node.js virtual environment builder"
category = "dev"
optional = false
python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*"
files = [
@ -595,6 +478,7 @@ setuptools = "*"
name = "oauthlib"
version = "3.2.2"
description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic"
category = "main"
optional = false
python-versions = ">=3.6"
files = [
@ -611,6 +495,7 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"]
name = "packaging"
version = "23.1"
description = "Core utilities for Python packages"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -618,21 +503,11 @@ files = [
{file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"},
]
[[package]]
name = "pathspec"
version = "0.11.2"
description = "Utility library for gitignore style pattern matching of file paths."
optional = false
python-versions = ">=3.7"
files = [
{file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"},
{file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"},
]
[[package]]
name = "platformdirs"
version = "3.10.0"
description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"."
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -646,13 +521,14 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co
[[package]]
name = "pluggy"
version = "1.3.0"
version = "1.2.0"
description = "plugin and hook calling mechanisms for python"
category = "dev"
optional = false
python-versions = ">=3.8"
python-versions = ">=3.7"
files = [
{file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"},
{file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"},
{file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"},
{file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"},
]
[package.extras]
@ -663,6 +539,7 @@ testing = ["pytest", "pytest-benchmark"]
name = "poster3"
version = "0.8.1"
description = "Streaming HTTP uploads and multipart/form-data encoding"
category = "main"
optional = false
python-versions = "*"
files = [
@ -676,6 +553,7 @@ poster3 = ["buildutils", "sphinx"]
name = "pre-commit"
version = "2.21.0"
description = "A framework for managing and maintaining multi-language pre-commit hooks."
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
@ -694,6 +572,7 @@ virtualenv = ">=20.10.0"
name = "pre-commit-poetry-export"
version = "0.1.2"
description = "pre-commit hook to keep requirements.txt updated"
category = "main"
optional = false
python-versions = ">=3.8,<4.0"
files = [
@ -705,6 +584,7 @@ files = [
name = "py"
version = "1.11.0"
description = "library with cross-python path, ini-parsing, io, code, log facilities"
category = "dev"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
files = [
@ -716,6 +596,7 @@ files = [
name = "pycodestyle"
version = "2.7.0"
description = "Python style guide checker"
category = "dev"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
@ -727,6 +608,7 @@ files = [
name = "pyflakes"
version = "2.3.1"
description = "passive checker of Python programs"
category = "dev"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
@ -738,6 +620,7 @@ files = [
name = "pymarkdown"
version = "0.1.4"
description = "Evaluate code in markdown"
category = "dev"
optional = false
python-versions = "*"
files = [
@ -751,6 +634,7 @@ toolz = "*"
name = "pymysql"
version = "1.1.0"
description = "Pure Python MySQL Driver"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -766,6 +650,7 @@ rsa = ["cryptography"]
name = "pytest"
version = "6.2.5"
description = "pytest: simple powerful testing with Python"
category = "dev"
optional = false
python-versions = ">=3.6"
files = [
@ -790,6 +675,7 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm
name = "pywikibot"
version = "6.6.5"
description = "Python MediaWiki Bot Framework"
category = "main"
optional = false
python-versions = ">=3.5.0"
files = [
@ -826,6 +712,7 @@ wikitextparser = ["wikitextparser (>=0.47.0)", "wikitextparser (>=0.47.5)"]
name = "pyyaml"
version = "6.0.1"
description = "YAML parser and emitter for Python"
category = "dev"
optional = false
python-versions = ">=3.6"
files = [
@ -834,7 +721,6 @@ files = [
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"},
{file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"},
{file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"},
{file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"},
{file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"},
{file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"},
@ -842,15 +728,8 @@ files = [
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"},
{file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"},
{file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"},
{file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"},
{file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"},
{file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"},
{file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"},
{file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"},
{file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"},
{file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"},
{file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"},
{file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"},
@ -867,7 +746,6 @@ files = [
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"},
{file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"},
{file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"},
{file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"},
{file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"},
{file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"},
@ -875,7 +753,6 @@ files = [
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"},
{file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"},
{file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"},
{file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"},
{file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"},
{file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"},
@ -885,6 +762,7 @@ files = [
name = "requests"
version = "2.31.0"
description = "Python HTTP for Humans."
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -906,6 +784,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
name = "requests-oauthlib"
version = "1.3.1"
description = "OAuthlib authentication support for Requests."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*"
files = [
@ -924,6 +803,7 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
name = "schema"
version = "0.7.5"
description = "Simple data validation library"
category = "main"
optional = false
python-versions = "*"
files = [
@ -938,6 +818,7 @@ contextlib2 = ">=0.5.5"
name = "setuptools"
version = "68.1.2"
description = "Easily download, build, install, upgrade, and uninstall Python packages"
category = "main"
optional = false
python-versions = ">=3.8"
files = [
@ -954,6 +835,7 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (
name = "six"
version = "1.16.0"
description = "Python 2 and 3 compatibility utilities"
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*"
files = [
@ -965,6 +847,7 @@ files = [
name = "toml"
version = "0.10.2"
description = "Python Library for Tom's Obvious, Minimal Language"
category = "dev"
optional = false
python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*"
files = [
@ -972,21 +855,11 @@ files = [
{file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"},
]
[[package]]
name = "tomli"
version = "2.0.1"
description = "A lil' TOML parser"
optional = false
python-versions = ">=3.7"
files = [
{file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"},
{file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"},
]
[[package]]
name = "toolz"
version = "0.12.0"
description = "List processing tools and functional utilities"
category = "dev"
optional = false
python-versions = ">=3.5"
files = [
@ -998,6 +871,7 @@ files = [
name = "tqdm"
version = "4.66.1"
description = "Fast, Extensible Progress Meter"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@ -1014,46 +888,11 @@ notebook = ["ipywidgets (>=6)"]
slack = ["slack-sdk"]
telegram = ["requests"]
[[package]]
name = "types-requests"
version = "2.31.0.2"
description = "Typing stubs for requests"
optional = false
python-versions = "*"
files = [
{file = "types-requests-2.31.0.2.tar.gz", hash = "sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40"},
{file = "types_requests-2.31.0.2-py3-none-any.whl", hash = "sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a"},
]
[package.dependencies]
types-urllib3 = "*"
[[package]]
name = "types-urllib3"
version = "1.26.25.14"
description = "Typing stubs for urllib3"
optional = false
python-versions = "*"
files = [
{file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"},
{file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"},
]
[[package]]
name = "typing-extensions"
version = "4.7.1"
description = "Backported and Experimental Type Hints for Python 3.7+"
optional = false
python-versions = ">=3.7"
files = [
{file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"},
{file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"},
]
[[package]]
name = "urllib3"
version = "1.26.16"
description = "HTTP library with thread-safe connection pooling, file post, and more."
category = "main"
optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*"
files = [
@ -1068,13 +907,14 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"]
[[package]]
name = "virtualenv"
version = "20.24.4"
version = "20.24.3"
description = "Virtual Python Environment builder"
category = "dev"
optional = false
python-versions = ">=3.7"
files = [
{file = "virtualenv-20.24.4-py3-none-any.whl", hash = "sha256:29c70bb9b88510f6414ac3e55c8b413a1f96239b6b789ca123437d5e892190cb"},
{file = "virtualenv-20.24.4.tar.gz", hash = "sha256:772b05bfda7ed3b8ecd16021ca9716273ad9f4467c801f27e83ac73430246dca"},
{file = "virtualenv-20.24.3-py3-none-any.whl", hash = "sha256:95a6e9398b4967fbcb5fef2acec5efaf9aa4972049d9ae41f95e0972a683fd02"},
{file = "virtualenv-20.24.3.tar.gz", hash = "sha256:e5c3b4ce817b0b328af041506a2a299418c98747c4b1e68cb7527e74ced23efc"},
]
[package.dependencies]
@ -1083,13 +923,14 @@ filelock = ">=3.12.2,<4"
platformdirs = ">=3.9.1,<4"
[package.extras]
docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"]
test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"]
[[package]]
name = "wikitools3"
version = "3.0.1"
description = "Python package for interacting with a MediaWiki wiki. It is used by WikiTeam for archiving MediaWiki wikis."
category = "main"
optional = false
python-versions = ">=3.8,<4.0"
files = [
@ -1103,4 +944,4 @@ poster3 = ">=0.8.1,<0.9.0"
[metadata]
lock-version = "2.0"
python-versions = "^3.8"
content-hash = "ebed56288c755209a5da1b75673fdda769a85b22d5f1c26fcb7492d971ffd617"
content-hash = "1eee6035c5660e8cba28942140937e2ceb36bf90482e76fa5ddd054efa3c659c"

@ -77,10 +77,6 @@ requests = "^2.31.0"
flake8 = "^3.9.2"
pre-commit = "^2.17.0"
pymarkdown = "^0.1.4"
mypy = "^1.5.1"
types-requests = "^2.31.0.2"
# flake8-black may be unnecessary?
flake8-black = "^0.3.6"
[build-system]
requires = ["poetry-core>=1.0.0"]
@ -88,7 +84,3 @@ build-backend = "poetry.core.masonry.api"
[tool.pymarkdown]
disable-rules = "line-length,no-inline-html"
[tool.mypy]
check_untyped_defs = true
ignore_missing_imports = true

@ -0,0 +1,26 @@
#!/usr/bin/env python3
# DumpGenerator A generator of dumps for wikis
# Copyright (C) 2011-2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki
from wikiteam3.dumpgenerator.dump import DumpGenerator
def main():
DumpGenerator()

@ -1,32 +1,6 @@
#!/usr/bin/env python3
# DumpGenerator A generator of dumps for wikis
# Copyright (C) 2011-2018 WikiTeam developers
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# To learn more, read the documentation:
# https://github.com/WikiTeam/wikiteam/wiki
from wikiteam3.dumpgenerator.dump import DumpGenerator
def main():
DumpGenerator()
if __name__ == "__main__":
import sys
from .__init__ import main
sys.exit(main())

@ -2,5 +2,3 @@ from .api import checkAPI, checkRetryAPI, mwGetAPIAndIndex
from .get_json import getJSON
from .handle_status_code import handleStatusCode
from .wiki_check import getWikiEngine
__all__ = [checkAPI, checkRetryAPI, mwGetAPIAndIndex, getJSON, handleStatusCode, getWikiEngine] # type: ignore

@ -1,6 +1,7 @@
import re
from typing import Any, Literal, Optional
from urllib.parse import urljoin, urlparse
import time
from typing import *
from urllib.parse import urljoin, urlparse, urlunparse
import mwclient
import requests
@ -10,8 +11,7 @@ from wikiteam3.utils import getUserAgent
from .get_json import getJSON
# api="", session: requests.Session = None
def checkAPI(api: str, session: requests.Session):
def checkAPI(api="", session: requests.Session = None):
"""Checking API availability"""
global cj
# handle redirects
@ -34,31 +34,29 @@ def checkAPI(api: str, session: requests.Session):
"MediaWiki API URL not found or giving error: HTTP %d" % r.status_code
)
return None
if r is not None:
if "MediaWiki API is not enabled for this site." in r.text:
return None
try:
result = getJSON(r)
index = None
if result:
try:
index = (
result["query"]["general"]["server"]
+ result["query"]["general"]["script"]
)
return (True, index, api)
except KeyError:
print("MediaWiki API seems to work but returned no index URL")
return (True, None, api)
except ValueError:
print(repr(r.text))
print("MediaWiki API returned data we could not parse")
return None
if "MediaWiki API is not enabled for this site." in r.text:
return None
try:
result = getJSON(r)
index = None
if result:
try:
index = (
result["query"]["general"]["server"]
+ result["query"]["general"]["script"]
)
return (True, index, api)
except KeyError:
print("MediaWiki API seems to work but returned no index URL")
return (True, None, api)
except ValueError:
print(repr(r.text))
print("MediaWiki API returned data we could not parse")
return None
return None
# url=""
def mwGetAPIAndIndex(url: str, session: requests.Session):
def mwGetAPIAndIndex(url="", session: requests.Session = None):
"""Returns the MediaWiki API and Index.php"""
api = ""
@ -110,21 +108,18 @@ def mwGetAPIAndIndex(url: str, session: requests.Session):
return api, index
# api="", apiclient=False
def checkRetryAPI(api: str, apiclient: bool, session: requests.Session):
def checkRetryAPI(api="", apiclient=False, session: requests.Session = None):
"""Call checkAPI and mwclient if necessary"""
check: (tuple[Literal[True], Any, str] | tuple[Literal[True], None, str] | None)
check = None
try:
check = checkAPI(api, session=session)
except requests.exceptions.ConnectionError as e:
print(f"Connection error: {str(e)}")
check = None
if check and apiclient:
apiurl = urlparse(api)
try:
# Returns a value, but we're just checking for an error here
mwclient.Site(
site = mwclient.Site(
apiurl.netloc,
apiurl.path.replace("api.php", ""),
scheme=apiurl.scheme,
@ -143,14 +138,13 @@ def checkRetryAPI(api: str, apiclient: bool, session: requests.Session):
)
try:
# Returns a value, but we're just checking for an error here
mwclient.Site(
site = mwclient.Site(
apiurl.netloc,
apiurl.path.replace("api.php", ""),
scheme=newscheme,
pool=session,
)
except KeyError:
check = False # type: ignore
check = False
return check, api # type: ignore
return check, api

@ -8,6 +8,6 @@ def getJSON(request: requests.Response):
# request.encoding = request.apparent_encoding
try:
return request.json()
except Exception:
except:
# Maybe an older API version which did not return correct JSON
return {}

@ -3,10 +3,9 @@ import re
import requests
# index="", cookies="", session=None
def checkIndex(index: str, cookies: str, session: requests.Session):
def checkIndex(index="", cookies="", session: requests.Session = None):
"""Checking index.php availability"""
r = session.post(url=index, data={"title": "Special:Version"}, timeout=30) # type: ignore
r = session.post(url=index, data={"title": "Special:Version"}, timeout=30)
if r.status_code >= 400:
print(f"ERROR: The wiki returned status code HTTP {r.status_code}")
return False

@ -1,50 +1,53 @@
import re
import requests
from wikiteam3.dumpgenerator.api import getJSON
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.config import Config
def getNamespacesScraper(config: Config, session: requests.Session):
def getNamespacesScraper(config: Config = None, session=None):
"""Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages"""
"""Function called if no API is available"""
namespaces = config.namespaces
# namespacenames = {0: ""} # main is 0, no prefix
namespacenames = {0: ""} # main is 0, no prefix
if namespaces:
r = session.post(
url=config.index, params={"title": "Special:Allpages"}, timeout=30 # type: ignore
url=config.index, params={"title": "Special:Allpages"}, timeout=30
)
raw = r.text
Delay(config=config)
Delay(config=config, session=session)
# [^>]*? to include selected="selected"
m = re.compile(
r'<option [^>]*?value=[\'"](?P<namespaceid>\d+)[\'"][^>]*?>(?P<namespacename>[^<]+)</option>'
).finditer(raw)
if "all" in namespaces:
namespaces = [int(i.group("namespaceid")) for i in m]
# namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
namespaces = []
for i in m:
namespaces.append(int(i.group("namespaceid")))
namespacenames[int(i.group("namespaceid"))] = i.group("namespacename")
else:
namespaces2 = [
int(i.group("namespaceid"))
for i in m
if int(i.group("namespaceid")) in namespaces
]
# check if those namespaces really exist in this wiki
namespaces2 = []
for i in m:
if int(i.group("namespaceid")) in namespaces:
namespaces2.append(int(i.group("namespaceid")))
namespacenames[int(i.group("namespaceid"))] = i.group(
"namespacename"
)
namespaces = namespaces2
else:
namespaces = [0]
namespaces = list(set(namespaces)) # uniques
print("%d namespaces found" % (len(namespaces)))
return namespaces
return namespaces, namespacenames
def getNamespacesAPI(config: Config, session: requests.Session):
def getNamespacesAPI(config: Config = None, session=None):
"""Uses the API to get the list of namespaces names and ids"""
namespaces = config.namespaces
# namespacenames = {0: ""} # main is 0, no prefix
namespacenames = {0: ""} # main is 0, no prefix
if namespaces:
r = session.get(
url=config.api,
@ -57,34 +60,37 @@ def getNamespacesAPI(config: Config, session: requests.Session):
timeout=30,
)
result = getJSON(r)
Delay(config=config)
Delay(config=config, session=session)
try:
nsquery = result["query"]["namespaces"]
except KeyError as ke:
except KeyError:
print("Error: could not get namespaces from the API request.")
print("HTTP %d" % r.status_code)
print(r.text)
raise ke
return None
if "all" in namespaces:
namespaces = [int(i) for i in nsquery.keys() if int(i) >= 0]
# -1: Special, -2: Media, excluding
# namespacenames[int(i)] = nsquery[i]["*"]
namespaces = []
for i in nsquery.keys():
if int(i) < 0: # -1: Special, -2: Media, excluding
continue
namespaces.append(int(i))
namespacenames[int(i)] = nsquery[i]["*"]
else:
# check if those namespaces really exist in this wiki
namespaces2 = []
for i in nsquery.keys():
# bi = i
bi = i
i = int(i)
if i < 0: # -1: Special, -2: Media, excluding
continue
if i in namespaces:
namespaces2.append(i)
# namespacenames[i] = nsquery[bi]["*"]
namespacenames[i] = nsquery[bi]["*"]
namespaces = namespaces2
else:
namespaces = [0]
namespaces = list(set(namespaces)) # uniques
print("%d namespaces found" % (len(namespaces)))
return namespaces
return namespaces, namespacenames

@ -1,11 +1,9 @@
import re
from typing import List
import sys
from urllib.parse import urlparse
import mwclient
import requests
from file_read_backwards import FileReadBackwards
from mwclient.page import Page
from wikiteam3.dumpgenerator.api.namespaces import (
getNamespacesAPI,
@ -17,10 +15,10 @@ from wikiteam3.utils import cleanHTML, domain2prefix, undoHTMLEntities
from wikiteam3.utils.monkey_patch import DelaySession
def getPageTitlesAPI(config: Config, session: requests.Session):
def getPageTitlesAPI(config: Config = None, session=None):
"""Uses the API to get the list of page titles"""
titles = []
namespaces: List[int] = getNamespacesAPI(config=config, session=session)
namespaces, namespacenames = getNamespacesAPI(config=config, session=session)
# apply delay to the session for mwclient.Site.allpages()
delay_session = DelaySession(
@ -40,11 +38,10 @@ def getPageTitlesAPI(config: Config, session: requests.Session):
scheme=apiurl.scheme,
pool=session,
)
for page in site.allpages(namespace=str(namespace)):
if page is Page:
title = page.name
titles.append(title)
yield title
for page in site.allpages(namespace=namespace):
title = page.name
titles.append(title)
yield title
if len(titles) != len(set(titles)):
print("Probably a loop, switching to next namespace")
@ -53,10 +50,10 @@ def getPageTitlesAPI(config: Config, session: requests.Session):
delay_session.release()
def getPageTitlesScraper(config: Config, session: requests.Session):
def getPageTitlesScraper(config: Config = None, session=None):
"""Scrape the list of page titles from Special:Allpages"""
titles = []
namespaces = getNamespacesScraper(config=config, session=session)
namespaces, namespacenames = getNamespacesScraper(config=config, session=session)
r_title = r'title="(?P<title>[^>]+)">'
r_suballpages1 = r'&amp;from=(?P<from>[^>"]+)&amp;to=(?P<to>[^>"]+)">'
r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">'
@ -78,7 +75,7 @@ def getPageTitlesScraper(config: Config, session: requests.Session):
elif re.search(r_suballpages3, raw):
r_suballpages = r_suballpages3
c = 0
# oldfr = ""
oldfr = ""
checked_suballpages = []
rawacum = raw
while r_suballpages and re.search(r_suballpages, raw) and c < deep:
@ -108,10 +105,10 @@ def getPageTitlesScraper(config: Config, session: requests.Session):
if name not in checked_suballpages:
# to avoid reload dupe subpages links
checked_suballpages.append(name)
Delay(config=config)
Delay(config=config, session=session)
# print ('Fetching URL: ', url)
r = session.get(url=url, timeout=10)
raw = r.text
raw = str(r.text)
raw = cleanHTML(raw)
rawacum += raw # merge it after removed junk
print(
@ -125,26 +122,27 @@ def getPageTitlesScraper(config: Config, session: requests.Session):
"pages",
)
Delay(config=config)
Delay(config=config, session=session)
assert (
currfr is not None
), "re.search found the pattern, but re.finditer fails, why?"
# oldfr = currfr
oldfr = currfr
c += 1
c = 0
m = re.compile(r_title).finditer(rawacum)
for i in m:
t = undoHTMLEntities(text=i.group("title"))
if not t.startswith("Special:") and t not in titles:
titles.append(t)
c += 1
if not t.startswith("Special:"):
if t not in titles:
titles.append(t)
c += 1
print(" %d titles retrieved in the namespace %d" % (c, namespace))
return titles
def getPageTitles(config: Config, session: requests.Session):
def getPageTitles(config: Config = None, session=None):
"""Get list of page titles"""
# http://en.wikipedia.org/wiki/Special:AllPages
# http://wiki.archiveteam.org/index.php?title=Special:AllPages
@ -170,7 +168,7 @@ def getPageTitles(config: Config, session: requests.Session):
if config.api:
try:
titles = getPageTitlesAPI(config=config, session=session)
except Exception:
except:
print("Error: could not get page titles from the API")
titles = getPageTitlesScraper(config=config, session=session)
elif config.index:
@ -195,7 +193,7 @@ def getPageTitles(config: Config, session: requests.Session):
def checkTitleOk(
config: Config,
config: Config = None,
):
try:
with FileReadBackwards(
@ -210,13 +208,13 @@ def checkTitleOk(
lasttitle = frb.readline().strip()
if lasttitle == "":
lasttitle = frb.readline().strip()
except Exception:
except:
lasttitle = "" # probably file does not exists
return lasttitle == "--END--"
def readTitles(config: Config, session: requests.Session, start: str, batch: bool):
def readTitles(config: Config = None, session=None, start=None, batch=False):
"""Read title list from a file, from the title "start" """
if not checkTitleOk(config):
getPageTitles(config=config, session=session)
@ -227,7 +225,7 @@ def readTitles(config: Config, session: requests.Session, start: str, batch: boo
titlesfile = open(f"{config.path}/{titlesfilename}", encoding="utf-8")
titlelist = []
seeking = start != ""
seeking = start is not None
with titlesfile as f:
for line in f:
title = line.strip()

@ -5,13 +5,13 @@ import requests
from wikiteam3.utils import getUserAgent
def getWikiEngine(url: str, session: requests.Session) -> str:
def getWikiEngine(url="", session: requests.Session = None) -> str:
"""Returns the wiki engine of a URL, if known"""
if not session:
session = requests.Session() # Create a new session
session.headers.update({"User-Agent": getUserAgent()})
r = session.post(url=url, timeout=30) # type: ignore
r = session.post(url=url, timeout=30)
if r.status_code == 405 or not r.text:
r = session.get(url=url, timeout=120)
result = r.text

@ -6,7 +6,7 @@ import os
import queue
import re
import sys
from typing import Any, Dict, Literal, Tuple
from typing import *
import requests
import urllib3
@ -15,9 +15,10 @@ from wikiteam3.dumpgenerator.api import checkRetryAPI, getWikiEngine, mwGetAPIAn
from wikiteam3.dumpgenerator.api.index_check import checkIndex
from wikiteam3.dumpgenerator.config import Config, newConfig
from wikiteam3.dumpgenerator.version import getVersion
from wikiteam3.utils import domain2prefix, getUserAgent, mod_requests_text, uniLogin
from wikiteam3.utils.user_agent import setupUserAgent
from wikiteam3.utils import domain2prefix, getUserAgent, mod_requests_text
from wikiteam3.utils.login import uniLogin
from ...utils.user_agent import setupUserAgent
from .delay import Delay
@ -222,13 +223,13 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
########################################
# Create session
mod_requests_text(requests) # type: ignore # monkey patch
mod_requests_text(requests) # monkey patch
session = requests.Session()
# Disable SSL verification
if args.insecure:
session.verify = False
urllib3.disable_warnings()
requests.packages.urllib3.disable_warnings()
print("WARNING: SSL certificate verification disabled")
# Custom session retry
@ -240,12 +241,14 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
class CustomRetry(Retry):
def increment(self, method=None, url=None, *args, **kwargs):
if "_pool" in kwargs:
conn: urllib3.connectionpool.HTTPSConnectionPool = kwargs["_pool"]
conn = kwargs[
"_pool"
] # type: urllib3.connectionpool.HTTPSConnectionPool
if "response" in kwargs:
try:
# drain conn in advance so that it won't be put back into conn.pool
kwargs["response"].drain_conn()
except Exception:
except:
pass
# Useless, retry happens inside urllib3
# for adapters in session.adapters.values():
@ -253,12 +256,12 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
# adapters.poolmanager.clear()
# Close existing connection so that a new connection will be used
if hasattr(conn, "pool") and conn.pool is not None:
if hasattr(conn, "pool"):
pool = conn.pool # type: queue.Queue
try:
# Don't directly use this, This closes connection pool by making conn.pool = None
conn.close()
except Exception:
except:
pass
conn.pool = pool
return super().increment(method=method, url=url, *args, **kwargs)
@ -271,8 +274,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
msg = "req retry (%s)" % response.status
else:
msg = None
# config=None
Delay(config=config, msg=msg, delay=backoff)
Delay(config=None, session=session, msg=msg, delay=backoff)
__retries__ = CustomRetry(
total=int(args.retries),
@ -290,7 +292,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
)
session.mount("https://", HTTPAdapter(max_retries=__retries__))
session.mount("http://", HTTPAdapter(max_retries=__retries__))
except Exception:
except:
# Our urllib3/requests is too old
pass
@ -299,7 +301,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
if args.cookies:
cj.load(args.cookies)
print("Using cookies from %s" % args.cookies)
session.cookies = cj # type: ignore
session.cookies = cj
# Setup user agent
session.headers.update({"User-Agent": getUserAgent()})
@ -310,17 +312,17 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
session.auth = (args.user, args.password)
# Execute meta info params
if args.wiki and args.get_wiki_engine:
print(getWikiEngine(url=args.wiki, session=session))
sys.exit(0)
if args.wiki:
if args.get_wiki_engine:
print(getWikiEngine(url=args.wiki, session=session))
sys.exit(0)
# Get API and index and verify
api: str = args.api or ""
index: str = args.index or ""
api = args.api if args.api else ""
index = args.index if args.index else ""
if api == "" or index == "":
if args.wiki:
if getWikiEngine(args.wiki, session=session) == "MediaWiki":
index2: str
api2, index2 = mwGetAPIAndIndex(args.wiki, session=session)
if not api:
api = api2
@ -337,12 +339,9 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
# print (api)
# print (index)
index2 = ""
index2 = None
check: (
tuple[Literal[True], Any, str] | tuple[Literal[True], None, str] | None
) = False # type: ignore
checkedapi = ""
check, checkedapi = False, None
if api:
check, checkedapi = checkRetryAPI(
api=api,
@ -350,9 +349,9 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
session=session,
)
if api != "" and check:
if api and check:
# Replace the index URL we got from the API check
index2 = str(check[1])
index2 = check[1]
api = checkedapi
print("API is OK: ", checkedapi)
else:
@ -392,10 +391,8 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
try:
index = "/".join(index.split("/")[:-1])
except AttributeError:
index = ""
if index != "" and checkIndex(
index=index, cookies=args.cookies, session=session
):
index = None
if index and checkIndex(index=index, cookies=args.cookies, session=session):
print("index.php is OK")
else:
print("Error in index.php.")
@ -476,7 +473,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]:
# calculating path, if not defined by user with --path=
if not config.path:
config.path = "./{}-{}-wikidump".format(
domain2prefix(config=config),
domain2prefix(config=config, session=session),
config.date,
)
print("No --path argument provided. Defaulting to:")

@ -1,3 +1,5 @@
import itertools
import sys
import threading
import time
@ -19,7 +21,7 @@ class Delay:
time.sleep(0.3)
def __init__(self, config: Config, msg=None, delay=None):
def __init__(self, config: Config = None, session=None, msg=None, delay=None):
"""Add a delay if configured for that"""
self.ellipses: str = "."

@ -19,12 +19,10 @@ config = {
}
"""
import contextlib
import dataclasses
import json
import sys
from dataclasses import field
from typing import List
from typing import *
def _dataclass_from_dict(klass_or_obj, d):
@ -45,7 +43,7 @@ class Config:
retries: int = 0
path: str = ""
logs: bool = False
date: str = ""
date: str = False
# URL params
index: str = ""
@ -58,8 +56,8 @@ class Config:
xmlrevisions: bool = False
xmlrevisions_page: bool = False
images: bool = False
namespaces: List[int] = field(default_factory=lambda: [])
exnamespaces: List[int] = field(default_factory=lambda: [])
namespaces: List[int] = None
exnamespaces: List[int] = None
api_chunksize: int = 0 # arvlimit, ailimit, etc
export: str = "" # Special:Export page name
@ -75,21 +73,24 @@ def newConfig(configDict) -> Config:
return _dataclass_from_dict(Config, configDict)
def loadConfig(config: Config, configfilename=""):
def loadConfig(config: Config = None, configfilename=""):
"""Load config file"""
configDict = dataclasses.asdict(config)
if config.path:
with contextlib.suppress(Exception):
try:
with open(f"{config.path}/{configfilename}", encoding="utf-8") as infile:
configDict.update(json.load(infile))
return newConfig(configDict)
except:
pass
print("There is no config file. we can't resume. Start a new dump.")
sys.exit()
def saveConfig(config: Config, configfilename=""):
def saveConfig(config: Config = None, configfilename=""):
"""Save config file"""
with open(f"{config.path}/{configfilename}", "w", encoding="utf-8") as outfile:

@ -1,12 +1,10 @@
try:
import contextlib
# import http.cookiejar
import http.cookiejar
import os
import re
import sys
import traceback
from typing import List
from file_read_backwards import FileReadBackwards
@ -22,7 +20,7 @@ except ImportError:
)
sys.exit(1)
from typing import Dict
from typing import *
from wikiteam3.dumpgenerator.cli import bye, getParameters, welcome
from wikiteam3.dumpgenerator.config import Config, loadConfig, saveConfig
@ -77,7 +75,7 @@ class DumpGenerator:
else contextlib.nullcontext()
):
print(welcome())
print(f"Analysing {config.api or config.index}")
print(f"Analysing {config.api if config.api else config.index}")
# creating path or resuming if desired
c = 2
@ -126,58 +124,57 @@ class DumpGenerator:
bye()
@staticmethod
def createNewDump(config: Config, other: Dict):
# other: Dict = None
def createNewDump(config: Config = None, other: Dict = None):
# we do lazy title dumping here :)
images = []
print("Trying generating a new dump into a new directory...")
if config.xml:
generateXMLDump(config=config, resume=False, session=other["session"])
generateXMLDump(config=config, session=other["session"])
checkXMLIntegrity(config=config, session=other["session"])
if config.images:
images += Image.getImageNames(config=config, session=other["session"])
Image.saveImageNames(config=config, images=images)
Image.saveImageNames(config=config, images=images, session=other["session"])
Image.generateImageDump(
config=config, other=other, images=images, session=other["session"]
)
if config.logs:
saveLogs(config=config, session=other["session"])
# other: Dict = None
@staticmethod
def resumePreviousDump(config: Config, other: Dict):
images: List[str] = []
def resumePreviousDump(config: Config = None, other: Dict = None):
images = []
print("Resuming previous dump process...")
if config.xml:
# checking xml dump
xmliscomplete = False
lastxmltitle = None
lastxmlrevid = None
# Exception means probably file does not exist
with contextlib.suppress(Exception):
try:
with FileReadBackwards(
"%s/%s-%s-%s.xml"
% (
config.path,
domain2prefix(config=config),
domain2prefix(config=config, session=other["session"]),
config.date,
"current" if config.curonly else "history",
),
encoding="utf-8",
) as frb:
for line in frb:
if line.strip() == "</mediawiki>":
for l in frb:
if l.strip() == "</mediawiki>":
# xml dump is complete
xmliscomplete = True
break
if xmlrevid := re.search(r" <id>([^<]+)</id>", line):
if xmlrevid := re.search(r" <id>([^<]+)</id>", l):
lastxmlrevid = int(xmlrevid.group(1))
if xmltitle := re.search(r"<title>([^<]+)</title>", line):
if xmltitle := re.search(r"<title>([^<]+)</title>", l):
lastxmltitle = undoHTMLEntities(text=xmltitle.group(1))
break
except:
pass  # probably the file does not exist
if xmliscomplete:
print("XML dump was completed in the previous session")
elif lastxmltitle:
@ -193,7 +190,7 @@ class DumpGenerator:
else:
# corrupt? only has XML header?
print("XML is corrupt? Regenerating...")
generateXMLDump(config=config, resume=False, session=other["session"])
generateXMLDump(config=config, session=other["session"])
if config.images:
# load images list
@ -206,9 +203,7 @@ class DumpGenerator:
if os.path.exists(imagesFilePath):
with open(imagesFilePath) as f:
lines = f.read().splitlines()
images.extend(
line.split("\t") for line in lines if re.search(r"\t", line)
)
images.extend(l.split("\t") for l in lines if re.search(r"\t", l))
if len(lines) == 0: # empty file
lastimage = "--EMPTY--"
if not lastimage:
@ -231,14 +226,16 @@ class DumpGenerator:
Image.saveImageNames(config=config, images=images)
# checking images directory
listdir = []
with contextlib.suppress(OSError):
try:
listdir = os.listdir(f"{config.path}/images")
except OSError:
pass # probably directory does not exist
listdir = set(listdir)
c_desc = 0
c_images = 0
c_checked = 0
for filename, url, uploader, size, sha1 in images:
# lastfilename = filename
lastfilename = filename
if other["filenamelimit"] < len(filename.encode("utf-8")):
logerror(
config=config,

@ -4,7 +4,7 @@ import re
import sys
import time
import urllib.parse
from typing import Dict, List
from typing import Dict, List, Optional
import requests
@ -20,19 +20,19 @@ from wikiteam3.utils import cleanHTML, domain2prefix, sha1File, undoHTMLEntities
class Image:
@staticmethod
def getXMLFileDesc(config: Config, title: str, session: requests.Session):
def getXMLFileDesc(config: Config = None, title="", session=None):
"""Get XML for image description page"""
config.curonly = True # tricky to get only the most recent desc
config.curonly = 1 # tricky to get only the most recent desc
return "".join(
list(getXMLPage(config=config, title=title, verbose=False, session=session))
)
# other: Dict = None,
# images: List[List] = None,
# session: requests.Session = None,
@staticmethod
def generateImageDump(
config: Config, other: Dict, images: List[List], session: requests.Session
config: Config = None,
other: Dict = None,
images: List[List] = None,
session: requests.Session = None,
):
"""Save files and descriptions using a file list\n
Deprecated: `start` is not used anymore."""
@ -49,9 +49,7 @@ class Image:
bypass_cdn_image_compression: bool = other["bypass_cdn_image_compression"]
def modify_params(
params: Dict[str, (str | int)] = {}
) -> Dict[str, (str | int)]:
def modify_params(params: Optional[Dict] = None) -> Dict:
"""bypass Cloudflare Polish (image optimization)"""
if params is None:
params = {}
@ -103,7 +101,7 @@ class Image:
+ "we will not try to download it...",
)
else:
Delay(config=config)
Delay(config=config, session=session)
original_url = url
r = session.head(url=url, params=modify_params(), allow_redirects=True)
check_response(r)
@ -118,20 +116,17 @@ class Image:
check_response(r)
# Try to fix a broken HTTP to HTTPS redirect
if (
r.status_code == 404
and original_url_redirected
and (
if r.status_code == 404 and original_url_redirected:
if (
original_url.split("://")[0] == "http"
and url.split("://")[0] == "https"
)
):
url = "https://" + original_url.split("://")[1]
# print 'Maybe a broken http to https redirect, trying ', url
r = session.get(
url=url, params=modify_params(), allow_redirects=False
)
check_response(r)
):
url = "https://" + original_url.split("://")[1]
# print 'Maybe a broken http to https redirect, trying ', url
r = session.get(
url=url, params=modify_params(), allow_redirects=False
)
check_response(r)
if r.status_code == 200:
try:
@ -165,7 +160,7 @@ class Image:
if os.path.isfile(f"{filename3}.desc"):
toContinue += 1
else:
Delay(config=config)
Delay(config=config, session=session)
# saving description if any
title = f"Image:{filename}"
try:
@ -236,7 +231,7 @@ class Image:
)
@staticmethod
def getImageNames(config: Config, session: requests.Session):
def getImageNames(config: Config = None, session: requests.Session = None):
"""Get list of image names"""
print(")Retrieving image filenames")
@ -256,7 +251,7 @@ class Image:
return images
@staticmethod
def getImageNamesScraper(config: Config, session: requests.Session):
def getImageNamesScraper(config: Config = None, session: requests.Session = None):
"""Retrieve file list: filename, url, uploader"""
images = []
@ -273,7 +268,7 @@ class Image:
timeout=30,
)
raw = r.text
Delay(config=config)
Delay(config=config, session=session)
# delicate wiki
if re.search(
r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)",
@ -350,7 +345,7 @@ class Image:
return images
@staticmethod
def getImageNamesAPI(config: Config, session: requests.Session):
def getImageNamesAPI(config: Config = None, session: requests.Session = None):
"""Retrieve file list: filename, url, uploader, size, sha1"""
# # Commented by @yzqzss:
# https://www.mediawiki.org/wiki/API:Allpages
@ -382,7 +377,7 @@ class Image:
r = session.get(url=config.api, params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
Delay(config=config)
Delay(config=config, session=session)
if "query" in jsonimages:
countImages += len(jsonimages["query"]["allimages"])
@ -470,7 +465,7 @@ class Image:
r = session.get(url=config.api, params=params, timeout=30)
handleStatusCode(r)
jsonimages = getJSON(r)
Delay(config=config)
Delay(config=config, session=session)
if "query" not in jsonimages:
# if the API doesn't return query data, then we're done
@ -517,7 +512,7 @@ class Image:
return images
@staticmethod
def saveImageNames(config: Config, images: List[List]):
def saveImageNames(config: Config = None, images: List[List] = None, session=None):
"""Save image list in a file, including filename, url, uploader, size and sha1"""
imagesfilename = "{}-{}-images.txt".format(
@ -550,7 +545,7 @@ class Image:
print("Image filenames and URLs saved at...", imagesfilename)
@staticmethod
def curateImageURL(config: Config, url=""):
def curateImageURL(config: Config = None, url=""):
"""Returns an absolute URL for an image, adding the domain if missing"""
if config.index:

@ -1,22 +1,20 @@
import os
import requests
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import removeIP
def saveIndexPHP(config: Config, session: requests.Session):
def saveIndexPHP(config: Config = None, session=None):
"""Save index.php as .html, to preserve license details available at the botom of the page"""
if os.path.exists(f"{config.path}/index.html"):
print("index.html exists, do not overwrite")
else:
print("Downloading index.php (Main Page) as index.html")
r = session.post(url=config.index, params=None, timeout=10) # type: ignore
raw = r.text
Delay(config=config)
r = session.post(url=config.index, params=None, timeout=10)
raw = str(r.text)
Delay(config=config, session=session)
raw = removeIP(raw=raw)
with open(f"{config.path}/index.html", "w", encoding="utf-8") as outfile:
outfile.write(raw)

@ -1,61 +1,58 @@
import json
import os
import requests
from wikiteam3.dumpgenerator.api import getJSON
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.config import Config
def saveSiteInfo(config: Config, session: requests.Session):
def saveSiteInfo(config: Config = None, session=None):
"""Save a file with site info"""
if not config.api:
return
if os.path.exists(f"{config.path}/siteinfo.json"):
print("siteinfo.json exists, do not overwrite")
return
print("Downloading site info as siteinfo.json")
else:
print("Downloading site info as siteinfo.json")
# MediaWiki 1.13+
r = session.get(
url=config.api,
params={
"action": "query",
"meta": "siteinfo",
"siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo",
"sinumberingroup": 1,
"format": "json",
},
timeout=10,
)
# MediaWiki 1.11-1.12
if "query" not in getJSON(r):
r = session.get(
url=config.api,
params={
"action": "query",
"meta": "siteinfo",
"siprop": "general|namespaces|statistics|dbrepllag|interwikimap",
"format": "json",
},
timeout=10,
)
# MediaWiki 1.8-1.10
if "query" not in getJSON(r):
# MediaWiki 1.13+
r = session.get(
url=config.api,
params={
"action": "query",
"meta": "siteinfo",
"siprop": "general|namespaces",
"siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo",
"sinumberingroup": 1,
"format": "json",
},
timeout=10,
)
result = getJSON(r)
Delay(config=config)
with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile:
outfile.write(json.dumps(result, indent=4, sort_keys=True))
# MediaWiki 1.11-1.12
if "query" not in getJSON(r):
r = session.get(
url=config.api,
params={
"action": "query",
"meta": "siteinfo",
"siprop": "general|namespaces|statistics|dbrepllag|interwikimap",
"format": "json",
},
timeout=10,
)
# MediaWiki 1.8-1.10
if "query" not in getJSON(r):
r = session.get(
url=config.api,
params={
"action": "query",
"meta": "siteinfo",
"siprop": "general|namespaces",
"format": "json",
},
timeout=10,
)
result = getJSON(r)
Delay(config=config, session=session)
with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile:
outfile.write(json.dumps(result, indent=4, sort_keys=True))

@ -1,10 +1,8 @@
import requests
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.config import Config
def saveLogs(config: Config, session: requests.Session):
def saveLogs(config: Config = None, session=None):
"""Save Special:Log"""
# get all logs from Special:Log
"""parse
@ -22,4 +20,4 @@ def saveLogs(config: Config, session: requests.Session):
<option value="">Todos los registros</option>
</select>
"""
Delay(config=config)
Delay(config=config, session=session)

@ -1,13 +1,11 @@
import os
import requests
from wikiteam3.dumpgenerator.cli import Delay
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.utils import removeIP
def saveSpecialVersion(config: Config, session: requests.Session):
def saveSpecialVersion(config: Config = None, session=None):
"""Save Special:Version as .html, to preserve extensions details"""
if os.path.exists(f"{config.path}/SpecialVersion.html"):
@ -15,10 +13,10 @@ def saveSpecialVersion(config: Config, session: requests.Session):
else:
print("Downloading Special:Version with extensions and other related info")
r = session.post(
url=config.index, params={"title": "Special:Version"}, timeout=10 # type: ignore
url=config.index, params={"title": "Special:Version"}, timeout=10
)
raw = r.text
Delay(config=config)
raw = str(r.text)
Delay(config=config, session=session)
raw = str(removeIP(raw=raw))
with open(
f"{config.path}/SpecialVersion.html", "w", encoding="utf-8"

@ -1,13 +1,10 @@
import requests
from wikiteam3.dumpgenerator.config import Config
from .page_xml_api import getXMLPageWithApi
from .page_xml_export import getXMLPageWithExport
# title="", verbose=True
def getXMLPage(config: Config, title: str, verbose: bool, session: requests.Session):
def getXMLPage(config: Config = None, title="", verbose=True, session=None):
if config.xmlapiexport:
return getXMLPageWithApi(
config=config, title=title, verbose=verbose, session=session

@ -1,7 +1,7 @@
import re
import time
import traceback
from typing import Dict
from typing import *
import requests
@ -11,71 +11,58 @@ from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingEr
from wikiteam3.dumpgenerator.log import logerror
try:
import xml.etree.ElementTree as ElementTree
import xml.etree.ElementTree as ET
except ImportError:
import xml.etree.ElementTree as ElementTree
import xml.etree.ElementTree as ET
import xml.dom.minidom as MD
def reconstructRevisions(root: ElementTree.Element):
# print ElementTree.tostring(rev)
page = ElementTree.Element("stub")
def reconstructRevisions(root=None):
# print ET.tostring(rev)
page = ET.Element("stub")
edits = 0
query: (ElementTree.Element | None) = root.find("query")
if query is None:
raise ValueError("query was none")
pages: (ElementTree.Element | None) = query.find("pages")
if pages is None:
raise ValueError("pages was none")
page_element: (ElementTree.Element | None) = query.find("page")
if page_element is None:
raise ValueError("page was none")
revisions: (ElementTree.Element | None) = page_element.find("revisions")
if revisions is None:
raise ValueError("revisions was none")
for rev in revisions.findall("rev"):
for rev in (
root.find("query").find("pages").find("page").find("revisions").findall("rev")
):
try:
rev_ = ElementTree.SubElement(page, "revision")
rev_ = ET.SubElement(page, "revision")
# id
ElementTree.SubElement(rev_, "id").text = rev.attrib["revid"]
ET.SubElement(rev_, "id").text = rev.attrib["revid"]
# parentid (optional, export-0.7+)
if "parentid" in rev.attrib:
ElementTree.SubElement(rev_, "parentid").text = rev.attrib["parentid"]
ET.SubElement(rev_, "parentid").text = rev.attrib["parentid"]
# timestamp
ElementTree.SubElement(rev_, "timestamp").text = rev.attrib["timestamp"]
ET.SubElement(rev_, "timestamp").text = rev.attrib["timestamp"]
# contributor
contributor = ElementTree.SubElement(rev_, "contributor")
contributor = ET.SubElement(rev_, "contributor")
if "userhidden" not in rev.attrib:
ElementTree.SubElement(contributor, "username").text = rev.attrib[
"user"
]
ElementTree.SubElement(contributor, "id").text = rev.attrib["userid"]
ET.SubElement(contributor, "username").text = rev.attrib["user"]
ET.SubElement(contributor, "id").text = rev.attrib["userid"]
else:
contributor.set("deleted", "deleted")
# comment (optional)
if "commenthidden" in rev.attrib:
print("commenthidden")
comment = ElementTree.SubElement(rev_, "comment")
comment = ET.SubElement(rev_, "comment")
comment.set("deleted", "deleted")
elif "comment" in rev.attrib and rev.attrib["comment"]: # '' is empty
comment = ElementTree.SubElement(rev_, "comment")
comment = ET.SubElement(rev_, "comment")
comment.text = rev.attrib["comment"]
# minor edit (optional)
if "minor" in rev.attrib:
ElementTree.SubElement(rev_, "minor")
ET.SubElement(rev_, "minor")
# model and format (optional, export-0.8+)
if "contentmodel" in rev.attrib:
ElementTree.SubElement(rev_, "model").text = rev.attrib[
ET.SubElement(rev_, "model").text = rev.attrib[
"contentmodel"
] # default: 'wikitext'
if "contentformat" in rev.attrib:
ElementTree.SubElement(rev_, "format").text = rev.attrib[
ET.SubElement(rev_, "format").text = rev.attrib[
"contentformat"
] # default: 'text/x-wiki'
# text
text = ElementTree.SubElement(rev_, "text")
text = ET.SubElement(rev_, "text")
if "texthidden" not in rev.attrib:
text.attrib["xml:space"] = "preserve"
text.attrib["bytes"] = rev.attrib["size"]
@ -85,28 +72,24 @@ def reconstructRevisions(root: ElementTree.Element):
text.set("deleted", "deleted")
# sha1
if "sha1" in rev.attrib:
sha1 = ElementTree.SubElement(rev_, "sha1")
sha1 = ET.SubElement(rev_, "sha1")
sha1.text = rev.attrib["sha1"]
elif "sha1hidden" in rev.attrib:
ElementTree.SubElement(rev_, "sha1") # stub
ET.SubElement(rev_, "sha1") # stub
edits += 1
except Exception as e:
# logerror(config=config, text='Error reconstructing revision, xml:%s' % (ElementTree.tostring(rev)))
print(ElementTree.tostring(rev))
# logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev)))
print(ET.tostring(rev))
traceback.print_exc()
page = None # type: ignore
page = None
edits = 0
raise e
return page, edits
# headers: Dict = None, params: Dict = None
def getXMLPageCoreWithApi(
headers: Dict,
params: Dict[str, (str | int)],
config: Config,
session: requests.Session,
headers: Dict = None, params: Dict = None, config: Config = None, session=None
):
""" """
# just send the API request
@ -118,7 +101,7 @@ def getXMLPageCoreWithApi(
increment = 20 # increment every retry
while not re.search(
r"</mediawiki>" if config.curonly else r"</api>", xml
r"</api>" if not config.curonly else r"</mediawiki>", xml
) or re.search(r"</error>", xml):
if c > 0 and c < maxretries:
wait = (
@ -131,8 +114,8 @@ def getXMLPageCoreWithApi(
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# rvlimit = 1 from mother function)
if int(params["rvlimit"]) > 1:
params["rvlimit"] = int(params["rvlimit"]) // 2 # half
if params["rvlimit"] > 1:
params["rvlimit"] = params["rvlimit"] / 2 # half
if c >= maxretries:
print(" We have retried %d times" % (c))
print(
@ -147,7 +130,7 @@ def getXMLPageCoreWithApi(
print(" Saving in the errors log, and skipping...")
logerror(
config=config,
text=f'Error while retrieving the last revision of "{params["titles" if config.xmlapiexport else "pages"]}". Skipping.', # .decode("utf-8")
text=f'Error while retrieving the last revision of "{params["titles" if config.xmlapiexport else "pages"].decode("utf-8")}". Skipping.',
)
raise ExportAbortedError(config.index)
# FIXME HANDLE HTTP Errors HERE
@ -166,10 +149,7 @@ def getXMLPageCoreWithApi(
return xml
# title="", verbose=True
def getXMLPageWithApi(
config: Config, title: str, verbose: bool, session: requests.Session
):
def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=None):
"""Get the full history (or current only) of a page using API:Query
if params['curonly'] is set, then using export&exportwrap to export
"""
@ -190,52 +170,42 @@ def getXMLPageWithApi(
"rvcontinue": None,
"rvlimit": config.api_chunksize,
}
firstpartok: bool = False
lastcontinue: str = ""
firstpartok = False
lastcontinue = None
numberofedits = 0
ret = ""
continueKey: str = ""
continueKey: Optional[str] = None
while True:
# in case the last request is not right, saving last time's progress
if not firstpartok:
try:
lastcontinue = params[continueKey]
except Exception:
lastcontinue = ""
except:
lastcontinue = None
xml = getXMLPageCoreWithApi(
headers={}, params=params, config=config, session=session
)
xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
if xml == "":
# just return so that we can continue, and getXMLPageCoreWithApi will log the error
return
try:
root = ElementTree.fromstring(xml.encode("utf-8"))
except Exception:
root = ET.fromstring(xml.encode("utf-8"))
except:
continue
try:
ret_query: (ElementTree.Element | None) = root.find("query")
if ret_query is None:
raise Exception("query was none")
ret_pages: (ElementTree.Element | None) = root.find("pages")
if ret_pages is None:
raise Exception("pages was none")
ret_page = ret_pages.find("page")
if ret_page is None:
continue
except Exception:
retpage = root.find("query").find("pages").find("page")
except:
continue
if "missing" in ret_page.attrib or "invalid" in ret_page.attrib:
if "missing" in retpage.attrib or "invalid" in retpage.attrib:
print("Page not found")
raise PageMissingError(params["titles"], xml)
if not firstpartok:
try:
# build the firstpart by ourselves to improve the memory usage
ret = " <page>\n"
ret += " <title>%s</title>\n" % (ret_page.attrib["title"])
ret += " <ns>%s</ns>\n" % (ret_page.attrib["ns"])
ret += " <id>%s</id>\n" % (ret_page.attrib["pageid"])
except Exception:
ret += " <title>%s</title>\n" % (retpage.attrib["title"])
ret += " <ns>%s</ns>\n" % (retpage.attrib["ns"])
ret += " <id>%s</id>\n" % (retpage.attrib["pageid"])
except:
firstpartok = False
continue
else:
@ -243,34 +213,30 @@ def getXMLPageWithApi(
yield ret
continueVal = None
continue_element: (ElementTree.Element | None) = root.find("continue")
query_continue_element: (ElementTree.Element | None) = root.find(
"query-continue"
)
if continue_element is not None:
if root.find("continue") is not None:
# uses continue.rvcontinue
# MW 1.26+
continueKey = "rvcontinue"
continueVal = continue_element.attrib["rvcontinue"]
elif query_continue_element is not None:
rev_continue = query_continue_element.find("revisions")
assert rev_continue is not None, "Should only have revisions continue"
if "rvcontinue" in rev_continue.attrib:
continueVal = root.find("continue").attrib["rvcontinue"]
elif root.find("query-continue") is not None:
revContinue = root.find("query-continue").find("revisions")
assert revContinue is not None, "Should only have revisions continue"
if "rvcontinue" in revContinue.attrib:
# MW 1.21 ~ 1.25
continueKey = "rvcontinue"
continueVal = rev_continue.attrib["rvcontinue"]
elif "rvstartid" in rev_continue.attrib:
continueVal = revContinue.attrib["rvcontinue"]
elif "rvstartid" in revContinue.attrib:
# TODO: MW ????
continueKey = "rvstartid"
continueVal = rev_continue.attrib["rvstartid"]
continueVal = revContinue.attrib["rvstartid"]
else:
# blindly assume the first attribute is the continue key
# may never happen
assert (
len(rev_continue.attrib) > 0
len(revContinue.attrib) > 0
), "Should have at least one attribute"
for continueKey in rev_continue.attrib.keys():
continueVal = rev_continue.attrib[continueKey]
for continueKey in revContinue.attrib.keys():
continueVal = revContinue.attrib[continueKey]
break
if continueVal is not None:
params[continueKey] = continueVal
@ -280,9 +246,7 @@ def getXMLPageWithApi(
# transform the revision
rev_, edits = reconstructRevisions(root=root)
xmldom = MD.parseString(
b"<stub1>" + ElementTree.tostring(rev_) + b"</stub1>"
)
xmldom = MD.parseString(b"<stub1>" + ET.tostring(rev_) + b"</stub1>")
# convert it into text in case it throws MemoryError
# delete the first three lines and the last two lines, which set the indent
ret += "".join(xmldom.toprettyxml(indent=" ").splitlines(True)[3:-2])
@ -290,7 +254,7 @@ def getXMLPageWithApi(
numberofedits += edits
if config.curonly or continueVal is None: # no continue
break
except Exception:
except:
traceback.print_exc()
params["rvcontinue"] = lastcontinue
ret = ""
@ -303,9 +267,7 @@ def getXMLPageWithApi(
"export": 1,
"exportnowrap": 1,
}
xml = getXMLPageCoreWithApi(
headers={}, params=params, config=config, session=session
)
xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
if xml == "":
raise ExportAbortedError(config.index)
if "</page>" not in xml:

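The getXMLPageWithApi/getXMLPageCoreWithApi hunks above revolve around the standard API:Query continuation loop (rvcontinue). A stripped-down sketch of that pattern (illustrative only; the endpoint URL is a placeholder, not taken from this diff):

import requests

API = "https://example.org/w/api.php"  # hypothetical endpoint

def iter_revisions(title: str, chunk: int = 50):
    session = requests.Session()
    params = {
        "action": "query",
        "prop": "revisions",
        "titles": title,
        "rvlimit": chunk,
        "rvprop": "ids|timestamp|user|comment|content",
        "format": "json",
    }
    while True:
        data = session.get(API, params=params, timeout=30).json()
        for page in data["query"]["pages"].values():
            yield from page.get("revisions", [])
        if "continue" not in data:
            break  # no rvcontinue token left, we are done
        params.update(data["continue"])  # carries rvcontinue into the next request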
@ -1,7 +1,7 @@
import re
import sys
import time
from typing import Dict
from typing import *
import requests
@ -12,12 +12,8 @@ from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.utils import uprint
# headers: Dict = None, params: Dict = None
def getXMLPageCore(
headers: Dict,
params: Dict[str, (str | int)],
config: Config,
session: requests.Session,
headers: Dict = None, params: Dict = None, config: Config = None, session=None
) -> str:
""""""
# returns XML containing params['limit'] revisions (or current only), ending in </mediawiki>
@ -41,8 +37,8 @@ def getXMLPageCore(
time.sleep(wait)
# reducing server load requesting smallest chunks (if curonly then
# limit = 1 from mother function)
if int(params["limit"]) > 1:
params["limit"] = int(params["limit"]) // 2 # half
if params["limit"] > 1:
params["limit"] = params["limit"] / 2 # half
if c >= maxretries:
print(" We have retried %d times" % (c))
print(
@ -56,9 +52,9 @@ def getXMLPageCore(
# params['curonly'] should mean that we've already tried this
# fallback, because it's set by the following if and passed to
# getXMLPageCore
if not config.curonly: # and "curonly" not in params:
if not config.curonly and "curonly" not in params:
print(" Trying to save only the last revision for this page...")
params["curonly"] = True
params["curonly"] = 1
logerror(
config=config,
to_stdout=True,
@ -79,7 +75,7 @@ def getXMLPageCore(
try:
r = session.post(
url=config.index, params=params, headers=headers, timeout=10
) # type: ignore
)
handleStatusCode(r)
xml = r.text
except requests.exceptions.ConnectionError as e:
@ -93,9 +89,7 @@ def getXMLPageCore(
return xml
def getXMLPageWithExport(
config: Config, title: str, verbose: bool, session: requests.Session
):
def getXMLPageWithExport(config: Config = None, title="", verbose=True, session=None):
"""Get the full history (or current only) of a page"""
truncated = False
@ -103,17 +97,9 @@ def getXMLPageWithExport(
title_ = re.sub(" ", "_", title_)
# do not convert & into %26, title_ = re.sub('&', '%26', title_)
if config.export:
params: Dict[str, (str | int)] = {
"title": config.export,
"pages": title_,
"action": "submit",
}
params = {"title": config.export, "pages": title_, "action": "submit"}
else:
params = {
"title": "Special:Export",
"pages": title_,
"action": "submit",
}
params = {"title": "Special:Export", "pages": title_, "action": "submit"}
if config.curonly:
params["curonly"] = 1
params["limit"] = 1
@ -128,7 +114,7 @@ def getXMLPageWithExport(
if config.templates:
params["templates"] = 1
xml = getXMLPageCore(headers={}, params=params, config=config, session=session)
xml = getXMLPageCore(params=params, config=config, session=session)
if xml == "":
raise ExportAbortedError(config.index)
if "</page>" not in xml:
@ -153,12 +139,10 @@ def getXMLPageWithExport(
# get the last timestamp from the acum XML
params["offset"] = re.findall(r_timestamp, xml)[-1]
try:
xml2 = getXMLPageCore(
headers={}, params=params, config=config, session=session
)
xml2 = getXMLPageCore(params=params, config=config, session=session)
except MemoryError:
print("The page's history exceeds our memory, halving limit.")
params["limit"] = int(params["limit"]) // 2
params["limit"] /= 2
continue
# are there more edits in this next XML chunk or no <page></page>?
@ -193,7 +177,7 @@ def getXMLPageWithExport(
)
except MemoryError:
"The page's history exceeds our memory, halving limit."
params["limit"] = int(params["limit"]) // 2
params["limit"] /= 2
continue
xml = xml2
edit_count += len(re.findall(r_timestamp, xml))

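Both getXMLPageCore and getXMLPageCoreWithApi retry failed Special:Export/API requests with a growing delay while shrinking the requested chunk size; the revert above switches that shrinking from integer division back to float division. A self-contained sketch of the retry shape (illustrative only; fetch_with_backoff and its arguments are made up):

import time

def fetch_with_backoff(fetch, limit: int, maxretries: int = 5, increment: int = 20):
    attempt = 0
    while attempt < maxretries:
        xml = fetch(limit)
        if xml.endswith("</mediawiki>"):
            return xml
        attempt += 1
        time.sleep(increment * attempt)  # grow the delay on every retry
        limit = max(1, limit // 2)       # halve the chunk size, floor at 1
    raise RuntimeError("giving up after %d retries" % maxretries)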
@ -1,15 +1,14 @@
import sys
import time
from typing import List
from datetime import datetime
from typing import *
from urllib.parse import urlparse
import lxml.etree
import mwclient
import requests
from lxml.etree import _ElementTree as ElementTree
from mwclient.errors import InvalidResponse, MwClientError
# from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.config import Config
from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import (
@ -23,8 +22,9 @@ ALL_NAMESPACE = -1
def getXMLRevisionsByAllRevisions(
config: Config,
site: mwclient.Site, # = None,
config: Config = None,
session=None,
site: mwclient.Site = None,
nscontinue=None,
arvcontinue=None,
):
@ -62,7 +62,55 @@ def getXMLRevisionsByAllRevisions(
if _arvcontinue is not None:
arvparams["arvcontinue"] = _arvcontinue
if config.curonly:
if not config.curonly:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams[
"arvprop"
] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
while True:
try:
arvrequest = site.api(http_method=config.http_method, **arvparams)
except requests.exceptions.HTTPError as e:
if e.response.status_code != 405 or config.http_method != "POST":
raise
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
except requests.exceptions.ReadTimeout as err:
# Hopefully temporary, just wait a bit and continue with the same request.
# No point putting a limit to retries, we'd need to abort everything.
# TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
# to use the retry adapter we use for our own requests session?
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
continue
except mwclient.errors.InvalidResponse as e:
if (
not e.response_text.startswith("<!DOCTYPE html>")
or config.http_method != "POST"
):
raise
print(
"POST request to the API failed (got HTML), retrying with GET"
)
config.http_method = "GET"
continue
for page in arvrequest["query"]["allrevisions"]:
yield makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
if "continue" in arvrequest:
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
else:
# End of continuation. We are done with this namespace.
break
else:
# FIXME: this is not curonly, just different strategy to do all revisions
# Just cycle through revision IDs and use the XML as is
print("Trying to list the revisions and to export them one by one")
@ -141,69 +189,22 @@ def getXMLRevisionsByAllRevisions(
)
except requests.exceptions.ReadTimeout as err:
# As above
print(f"ERROR: {str(err)}\nSleeping for 20 seconds")
time.sleep(20)
# But avoid rewriting the same revisions
arvrequest["query"]["allrevisions"] = []
else:
# We have to build the XML manually...
# Skip flags, presumably needed to add <minor/> which is in the schema.
# Also missing: parentid and contentformat.
arvparams[
"arvprop"
] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags"
print(
"Trying to get wikitext from the allrevisions API and to build the XML"
)
while True:
try:
arvrequest = site.api(http_method=config.http_method, **arvparams)
except requests.exceptions.HTTPError as e:
if e.response.status_code != 405 or config.http_method != "POST":
raise
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
continue
except requests.exceptions.ReadTimeout as err:
# Hopefully temporary, just wait a bit and continue with the same request.
# No point putting a limit to retries, we'd need to abort everything.
# TODO: reuse the retry logic of the checkAPI phase? Or force mwclient
# to use the retry adapter we use for our own requests session?
print(f"ERROR: {str(err)}")
print("Sleeping for 20 seconds")
time.sleep(20)
continue
except InvalidResponse as e:
if (
e.response_text is not None
and not e.response_text.startswith("<!DOCTYPE html>")
) or config.http_method != "POST":
raise
print(
"POST request to the API failed (got HTML), retrying with GET"
)
config.http_method = "GET"
continue
for page in arvrequest["query"]["allrevisions"]:
yield makeXmlFromPage(page, arvparams.get("arvcontinue", ""))
if "continue" in arvrequest:
arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"]
else:
# End of continuation. We are done with this namespace.
break
# But avoid rewriting the same revisions
arvrequest["query"]["allrevisions"] = []
def getXMLRevisionsByTitles(
config: Config, session: requests.Session, site: mwclient.Site, start: str
config: Config = None, session=None, site: mwclient.Site = None, start=None
):
c = 0
if config.curonly:
# The raw XML export in the API gets a title and gives the latest revision.
# We could also use the allpages API as generator but let's be consistent.
print("Getting titles to export the latest revision for each")
for title in readTitles(config, session=session, start=start, batch=False):
for title in readTitles(config, session=session, start=start):
# TODO: respect verbose flag, reuse output from getXMLPage
print(f" {title}")
# TODO: as we're doing one page and revision at a time, we might
@ -237,7 +238,7 @@ def getXMLRevisionsByTitles(
# The XML needs to be made manually because the export=1 option
# refuses to return an arbitrary number of revisions (see above).
print("Getting titles to export all the revisions of each")
titlelist: (str | List[str]) = []
titlelist = []
# TODO: Decide a suitable number of a batched request. Careful:
# batched responses may not return all revisions.
for titlelist in readTitles(config, session=session, start=start, batch=False):
@ -247,11 +248,9 @@ def getXMLRevisionsByTitles(
print(f" {title}")
# Try and ask everything. At least on MediaWiki 1.16, unknown props are discarded:
# "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}}
if titlelist is List:
titlelist = "|".join(titlelist)
pparams = {
"action": "query",
"titles": titlelist,
"titles": "|".join(titlelist),
"prop": "revisions",
"rvlimit": config.api_chunksize,
"rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags",
@ -264,13 +263,11 @@ def getXMLRevisionsByTitles(
print("POST request to the API failed, retrying with GET")
config.http_method = "GET"
prequest = site.api(http_method=config.http_method, **pparams)
except InvalidResponse:
if titlelist is List:
titlelist = "; ".join(titlelist)
except mwclient.errors.InvalidResponse:
logerror(
config=config,
to_stdout=True,
text=f"Error: page inaccessible? Could not export page: {titlelist}",
text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}',
)
continue
@ -282,12 +279,10 @@ def getXMLRevisionsByTitles(
try:
pages = prequest["query"]["pages"]
except KeyError:
if titlelist is List:
titlelist = "; ".join(titlelist)
logerror(
config=config,
to_stdout=True,
text=f"Error: page inaccessible? Could not export page: {titlelist}",
text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}',
)
break
# Go through the data we got to build the XML.
@ -295,12 +290,10 @@ def getXMLRevisionsByTitles(
try:
yield makeXmlFromPage(pages[pageid], None)
except PageMissingError:
if titlelist is List:
titlelist = "; ".join(titlelist)
logerror(
config=config,
to_stdout=True,
text=f"Error: empty revision from API. Could not export page: {titlelist}",
text=f'Error: empty revision from API. Could not export page: {"; ".join(titlelist)}',
)
continue
@ -331,12 +324,8 @@ def getXMLRevisionsByTitles(
print(f"\n-> Downloaded {c} pages\n")
# useAllrevision=True, lastPage=None
def getXMLRevisions(
config: Config,
session: requests.Session,
useAllrevision: bool,
lastPage: (ElementTree | None),
config: Config = None, session=None, useAllrevision=True, lastPage=None
):
# FIXME: actually figure out the various strategies for each MediaWiki version
apiurl = urlparse(config.api)
@ -353,7 +342,7 @@ def getXMLRevisions(
# Find last title
if lastPage is not None:
try:
lastNs = int(lastPage.find("ns", None).text)
lastNs = int(lastPage.find("ns").text)
lastArvcontinue = lastPage.attrib["arvcontinue"]
except Exception:
print(
@ -361,38 +350,43 @@ def getXMLRevisions(
)
raise
nscontinue = lastNs
arvcontinue = lastArvcontinue or None
arvcontinue = lastArvcontinue
if not arvcontinue:
arvcontinue = None
else:
nscontinue = None
arvcontinue = None
try:
return getXMLRevisionsByAllRevisions(config, site, nscontinue, arvcontinue)
except (KeyError, InvalidResponse) as e:
return getXMLRevisionsByAllRevisions(
config, session, site, nscontinue, arvcontinue
)
except (KeyError, mwclient.errors.InvalidResponse) as e:
print(e)
# TODO: check whether the KeyError was really for a missing arv API
print(
f"{str(e)}/nWarning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page"
"Warning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page"
)
sys.exit()
else:
# Find last title
if lastPage is not None:
try:
start = lastPage.find("title", None)
start = lastPage.find("title")
except Exception:
print(
f"Failed to find title in last trunk XML: {lxml.etree.tostring(lastPage)}"
)
raise
else:
start = ""
start = None
try:
# # Uncomment these lines to raise a KeyError for testing
# raise KeyError(999999)
# # DO NOT UNCOMMMENT IN RELEASE
return getXMLRevisionsByTitles(config, session, site, start)
except MwClientError as e:
except mwclient.errors.MwClientError as e:
print(e)
print("This mwclient version seems not to work for us. Exiting.")
sys.exit()

@ -6,7 +6,7 @@ from wikiteam3.dumpgenerator.exceptions import PageMissingError
def makeXmlPageFromRaw(xml, arvcontinue) -> str:
"""Discard the metadata around a <page> element in <mediawiki> string"""
root = etree.XML(text=xml, parser=None)
root = etree.XML(xml)
find = etree.XPath("//*[local-name() = 'page']")
page = find(root)[0]
if arvcontinue is not None:
@ -14,7 +14,7 @@ def makeXmlPageFromRaw(xml, arvcontinue) -> str:
# The tag will inherit the namespace, like:
# <page xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
# FIXME: pretty_print doesn't seem to work, only adds a newline
return etree.tostring(page, pretty_print=True, encoding="unicode") # type: ignore
return etree.tostring(page, pretty_print=True, encoding="unicode")
def makeXmlFromPage(page: dict, arvcontinue) -> str:
@ -124,4 +124,4 @@ def makeXmlFromPage(page: dict, arvcontinue) -> str:
except KeyError as e:
print(e)
raise PageMissingError(page["title"], e)
return etree.tostring(p, pretty_print=True, encoding="unicode") # type: ignore
return etree.tostring(p, pretty_print=True, encoding="unicode")

@ -1,12 +1,8 @@
import re
import sys
from io import TextIOWrapper
from typing import *
import lxml.etree
import requests
# from typing import *
from lxml.etree import _ElementTree as ElementTree
from wikiteam3.dumpgenerator.api.page_titles import readTitles
from wikiteam3.dumpgenerator.cli import Delay
@ -23,14 +19,12 @@ from wikiteam3.dumpgenerator.log import logerror
from wikiteam3.utils import cleanXML, domain2prefix, undoHTMLEntities
# lastPage=None,
# useAllrevisions=False,
def doXMLRevisionDump(
config: Config,
session: requests.Session,
xmlfile: TextIOWrapper,
lastPage: (ElementTree | None),
useAllrevisions: bool,
config: Config = None,
session=None,
xmlfile=None,
lastPage=None,
useAllrevisions=False,
):
try:
r_timestamp = "<timestamp>([^<]+)</timestamp>"
@ -47,17 +41,16 @@ def doXMLRevisionDump(
if arvcontinueRe := re.findall(r_arvcontinue, xml):
curArvcontinue = arvcontinueRe[0]
if lastArvcontinue != curArvcontinue:
Delay(config=config)
Delay(config=config, session=session)
lastArvcontinue = curArvcontinue
# Due to how generators work, it's expected this may be less
xml = cleanXML(xml=xml)
xmlfile.write(xml)
xmltitle = re.search(r"<title>([^<]+)</title>", xml)
if xmltitle is not None:
title = undoHTMLEntities(text=xmltitle[1])
print(f"{title}, {numrevs} edits (--xmlrevisions)")
# Delay(config=config)
title = undoHTMLEntities(text=xmltitle.group(1))
print(f"{title}, {numrevs} edits (--xmlrevisions)")
# Delay(config=config, session=session)
except AttributeError as e:
print(e)
print("This API library version is not working")
@ -66,13 +59,11 @@ def doXMLRevisionDump(
print(e)
def doXMLExportDump(
config: Config, session: requests.Session, xmlfile: TextIOWrapper, lastPage=None
):
def doXMLExportDump(config: Config = None, session=None, xmlfile=None, lastPage=None):
print("\nRetrieving the XML for every page\n")
lock = True
start: str = ""
start = None
if lastPage is not None:
try:
start = lastPage.find("title").text
@ -86,20 +77,18 @@ def doXMLExportDump(
lock = False
c = 1
for title in readTitles(config, session=session, start=start, batch=False):
if title is not str or title == "":
for title in readTitles(config, session=session, start=start):
if not title:
continue
if title == start: # start downloading from start, included
lock = False
if lock:
continue
Delay(config=config)
Delay(config=config, session=session)
if c % 10 == 0:
print(f"\n-> Downloaded {c} pages\n")
try:
for xml in getXMLPage(
config=config, verbose=True, title=title, session=session
):
for xml in getXMLPage(config=config, title=title, session=session):
xml = cleanXML(xml=xml)
xmlfile.write(xml)
except PageMissingError:
@ -115,8 +104,7 @@ def doXMLExportDump(
c += 1
# resume=False
def generateXMLDump(config: Config, resume: bool, session: requests.Session):
def generateXMLDump(config: Config = None, resume=False, session=None):
"""Generates a XML dump for a list of titles or from revision IDs"""
header, config = getXMLHeader(config=config, session=session)
@ -126,9 +114,9 @@ def generateXMLDump(config: Config, resume: bool, session: requests.Session):
config.date,
"current" if config.curonly else "history",
)
xmlfile: TextIOWrapper
xmlfile = None
lastPage: (ElementTree | None) = None
lastPage = None
lastPageChunk = None
# start != None means we are resuming an XML dump
if resume:
@ -140,9 +128,8 @@ def generateXMLDump(config: Config, resume: bool, session: requests.Session):
resume = False
lastPage = None
else:
try:
lastPage = parseLastPageChunk(lastPageChunk)
except lxml.etree.LxmlError:
lastPage = parseLastPageChunk(lastPageChunk)
if lastPage is None:
print("Failed to parse last page chunk: \n%s" % lastPageChunk)
print("Cannot resume, exiting now!")
sys.exit(1)

@ -1,8 +1,7 @@
import contextlib
import json
import re
import sys
from typing import Tuple
from typing import *
import requests
@ -12,29 +11,31 @@ from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingEr
from wikiteam3.dumpgenerator.log import logerror
def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config]:
def getXMLHeader(config: Config = None, session=None) -> Tuple[str, Config]:
"""Retrieve a random page to extract XML headers (namespace info, etc)"""
print(config.api)
xml = ""
disableSpecialExport = config.xmlrevisions or config.xmlapiexport
randomtitle = "Main_Page"
if disableSpecialExport and config.api and config.api.endswith("api.php"):
with contextlib.suppress(requests.exceptions.RetryError):
try:
print("Getting the XML header from the API")
# Export and exportnowrap exist from MediaWiki 1.15, allpages from 1.8
r = session.get(
f"{config.api}?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1",
timeout=10,
)
xml = r.text
xml: str = r.text
# Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19
if not re.match(r"\s*<mediawiki", xml):
r = session.get(
f"{config.api}?action=query&export=1&list=allpages&aplimit=1&format=json",
timeout=10,
)
with contextlib.suppress(KeyError):
try:
xml = r.json()["query"]["export"]["*"]
except KeyError:
pass
if not re.match(r"\s*<mediawiki", xml):
# Do without a generator, use our usual trick of a random page title
r = session.get(
@ -48,8 +49,12 @@ def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config
f"{config.api}?action=query&export=1&format=json&titles={randomtitle}",
timeout=10,
)
with contextlib.suppress(KeyError):
try:
xml = r.json()["query"]["export"]["*"]
except KeyError:
pass
except requests.exceptions.RetryError:
pass
else:
try:
@ -67,36 +72,36 @@ def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config
# The <page> does not exist. Not a problem, if we get the <siteinfo>.
xml = pme.xml
except ExportAbortedError:
with contextlib.suppress(ExportAbortedError):
try:
if config.api:
print("Trying the local name for the Special namespace instead")
r = session.get(
url=config.api,
params={
"action": "query",
"meta": "siteinfo",
"siprop": "namespaces",
"format": "json",
},
timeout=120,
)
config.export = (
json.loads(r.text)["query"]["namespaces"]["-1"]["*"]
+ ":Export"
)
xml = "".join(
list(
getXMLPage(
config=config,
title=randomtitle,
verbose=False,
session=session,
)
try:
if config.api:
print("Trying the local name for the Special namespace instead")
r = session.get(
url=config.api,
params={
"action": "query",
"meta": "siteinfo",
"siprop": "namespaces",
"format": "json",
},
timeout=120,
)
config.export = (
json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + ":Export"
)
xml = "".join(
list(
getXMLPage(
config=config,
title=randomtitle,
verbose=False,
session=session,
)
)
except PageMissingError as pme:
xml = pme.xml
)
except PageMissingError as pme:
xml = pme.xml
except ExportAbortedError:
pass
header = xml.split("</mediawiki>")[0]
if not re.match(r"\s*<mediawiki", xml):
@ -111,9 +116,7 @@ def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config
print(xml)
print("XML export on this wiki is broken, quitting.")
logerror(
config=config,
to_stdout=True,
text="XML export on this wiki is broken, quitting.",
to_stdout=True, text="XML export on this wiki is broken, quitting."
)
sys.exit()
return header, config

@ -1,10 +1,10 @@
from typing import Iterable
from typing import *
from wikiteam3.dumpgenerator.config import Config
def checkXMLIntegrity(
config: Config, titles: (Iterable[str] | None) = None, session=None
config: Config = None, titles: Iterable[str] = None, session=None
):
"""Check XML dump integrity, to detect broken XML chunks"""
# TODO: Fix XML Integrity Check

@ -1,9 +1,9 @@
import os
from io import StringIO
from typing import *
import lxml.etree
from file_read_backwards import FileReadBackwards
from lxml.etree import _ElementTree as ElementTree
def endsWithNewlines(filename: str) -> int:
@ -60,9 +60,10 @@ def truncateXMLDump(filename: str) -> str:
return incomplete_segment
def parseLastPageChunk(chunk) -> ElementTree:
parser = lxml.etree.XMLParser(recover=True)
tree = lxml.etree.parse(StringIO(chunk), parser)
return tree.getroot()
# except lxml.etree.LxmlError:
# return None
def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]:
try:
parser = lxml.etree.XMLParser(recover=True)
tree = lxml.etree.parse(StringIO(chunk), parser)
return tree.getroot()
except lxml.etree.LxmlError:
return None

@ -3,7 +3,7 @@ import datetime
from wikiteam3.dumpgenerator.config import Config
def logerror(config: Config, to_stdout=False, text="") -> None:
def logerror(config: Config = None, to_stdout=False, text="") -> None:
"""Log error in errors.log"""
if text:
with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile:

@ -25,7 +25,7 @@ def _new_config_from_parameter(params):
def get_config(mediawiki_ver, api=True):
assert api == True # type: ignore
assert api == True
if mediawiki_ver == "1.16.5":
return _new_config_from_parameter(
[
@ -33,4 +33,3 @@ def get_config(mediawiki_ver, api=True):
"http://group0.mediawiki.demo.save-web.org/mediawiki-1.16.5/api.php",
]
)
raise ValueError(f"Expected mediawiki_ver '1.16.5'; got {mediawiki_ver}")

@ -22,7 +22,7 @@ TODO:
* advanced: batch downloads, upload to Internet Archive or anywhere
"""
import contextlib
import os
import platform
import random
@ -129,7 +129,7 @@ class App:
self.button11 = Button(
self.labelframe11,
text="Check",
command=lambda: threading.start_new_threading(self.checkURL, ()), # type: ignore
command=lambda: threading.start_new_threading(self.checkURL, ()),
width=5,
)
self.button11.grid(row=0, column=3)
@ -275,14 +275,14 @@ class App:
self.button21 = Button(
self.frame2,
text="Load available dumps",
command=lambda: threading.start_new_threading(self.loadAvailableDumps, ()), # type: ignore
command=lambda: threading.start_new_threading(self.loadAvailableDumps, ()),
width=15,
)
self.button21.grid(row=3, column=0)
self.button23 = Button(
self.frame2,
text="Download selection",
command=lambda: threading.start_new_threading(self.downloadDump, ()), # type: ignore
command=lambda: threading.start_new_threading(self.downloadDump, ()),
width=15,
)
self.button23.grid(row=3, column=4)
@ -337,7 +337,7 @@ class App:
): # well-constructed URL?, one dot at least, aaaaa.com, but bb.aaaaa.com is allowed too
if self.optionmenu11var.get() == "api.php":
self.msg("Please wait... Checking api.php...")
if checkAPI(self.entry11.get(), None): # type: ignore
if checkAPI(self.entry11.get()):
self.entry11.config(background="lightgreen")
self.msg("api.php is correct!", level="ok")
else:
@ -345,7 +345,7 @@ class App:
self.msg("api.php is incorrect!", level="error")
elif self.optionmenu11var.get() == "index.php":
self.msg("Please wait... Checking index.php...")
if checkIndex(self.entry11.get(), None): # type: ignore
if checkIndex(self.entry11.get()):
self.entry11.config(background="lightgreen")
self.msg("index.php is OK!", level="ok")
else:
@ -374,7 +374,7 @@ class App:
def run(self):
for _ in range(10):
time.sleep(0.1)
self.value += 10 # type: ignore
self.value += 10
"""
#get parameters selected
@ -388,7 +388,7 @@ class App:
def msg(self, msg="", level=""):
levels = {"ok": "lightgreen", "warning": "yellow", "error": "red"}
if level.lower() in levels:
if levels.has_key(level.lower()):
print(f"{level.upper()}: {msg}")
self.status.config(
text=f"{level.upper()}: {msg}", background=levels[level.lower()]
@ -398,9 +398,9 @@ class App:
self.status.config(text=msg, background="grey")
def treeSortColumn(self, column, reverse=False):
line = [(self.tree.set(i, column), i) for i in self.tree.get_children("")]
line.sort(reverse=reverse)
for index, (val, i) in enumerate(line):
l = [(self.tree.set(i, column), i) for i in self.tree.get_children("")]
l.sort(reverse=reverse)
for index, (val, i) in enumerate(l):
self.tree.move(i, "", index)
self.tree.heading(
column,
@ -408,7 +408,7 @@ class App:
)
def downloadProgress(self, block_count, block_size, total_size):
with contextlib.suppress(Exception):
try:
total_mb = total_size / 1024 / 1024.0
downloaded = block_count * (block_size / 1024 / 1024.0)
percent = downloaded / (total_mb / 100.0)
@ -419,6 +419,8 @@ class App:
self.msg(msg, level="ok")
# sys.stdout.write("%.1f MB of %.1f MB downloaded (%.2f%%)" %(downloaded, total_mb, percent))
# sys.stdout.flush()
except:
pass
def downloadDump(self, event=None):
if self.block:
@ -450,7 +452,7 @@ class App:
self.dumps[int(item)][5],
)
)
urllib.urlretrieve( # type: ignore
f = urllib.urlretrieve(
self.dumps[int(item)][5],
filepath,
reporthook=self.downloadProgress,
@ -612,11 +614,11 @@ class App:
],
]
wikifarms_r = re.compile(f'({"|".join(wikifarms.keys())})')
# c = 0
c = 0
for mirror, url, regexp in self.urls:
print("Loading data from", mirror, url)
self.msg(msg=f"Please wait... Loading data from {mirror} {url}")
f = urllib.request.urlopen(url) # type: ignore
f = urllib.request.urlopen(url)
m = re.compile(regexp).finditer(f.read())
for i in m:
filename = i.group("filename")
@ -626,7 +628,9 @@ class App:
if re.search(wikifarms_r, filename):
wikifarm = re.findall(wikifarms_r, filename)[0]
wikifarm = wikifarms[wikifarm]
size = i.group("size") or "Unknown"
size = i.group("size")
if not size:
size = "Unknown"
date = "Unknown"
if re.search(r"\-(\d{8})[\.-]", filename):
date = re.findall(r"\-(\d{4})(\d{2})(\d{2})[\.-]", filename)[0]

@ -15,9 +15,12 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
import argparse
import getopt
import hashlib
import os
import re
import shutil
import subprocess
import time
import urllib.parse
from io import BytesIO
@ -92,7 +95,6 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
prefix = domain2prefix(Config(api=wiki))
except KeyError:
print("ERROR: could not produce the prefix for %s" % wiki)
continue
wikiname = prefix.split("-")[0]
dumps = []
@ -161,29 +163,29 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
r = requests.get(url=wiki, params=params, headers=headers)
if r.status_code < 400:
xml = r.text
except requests.exceptions.ConnectionError:
except requests.exceptions.ConnectionError as e:
pass
sitename = ""
baseurl = ""
lang = ""
try:
sitename = re.findall(r"sitename=\"([^\"]+)\"", xml)[0] # type: ignore
except Exception:
sitename = re.findall(r"sitename=\"([^\"]+)\"", xml)[0]
except:
pass
try:
baseurl = re.findall(r"base=\"([^\"]+)\"", xml)[0] # type: ignore
except Exception:
baseurl = re.findall(r"base=\"([^\"]+)\"", xml)[0]
except:
pass
try:
lang = re.findall(r"lang=\"([^\"]+)\"", xml)[0] # type: ignore
except Exception:
lang = re.findall(r"lang=\"([^\"]+)\"", xml)[0]
except:
pass
if not sitename:
sitename = wikiname
if not baseurl:
baseurl = re.sub(r"(?im)/api\.php", r"", wiki) # type: ignore
baseurl = re.sub(r"(?im)/api\.php", r"", wiki)
# Convert protocol-relative URLs
baseurl = re.sub("^//", "https://", baseurl)
if lang:
@ -205,7 +207,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
r = requests.get(url=wiki, params=params, headers=headers)
if r.status_code < 400:
xml = r.text
except requests.exceptions.ConnectionError:
except requests.exceptions.ConnectionError as e:
pass
rightsinfourl = ""
@ -213,7 +215,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
try:
rightsinfourl = re.findall(r"rightsinfo url=\"([^\"]+)\"", xml)[0]
rightsinfotext = re.findall(r"text=\"([^\"]+)\"", xml)[0]
except Exception:
except:
pass
raw = ""
@ -221,7 +223,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
r = requests.get(url=baseurl, headers=headers)
if r.status_code < 400:
raw = r.text
except requests.exceptions.ConnectionError:
except requests.exceptions.ConnectionError as e:
pass
# or copyright info from #footer in mainpage
@ -233,13 +235,13 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
rightsinfourl = re.findall(
r"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw
)[0]
except Exception:
except:
pass
try:
rightsinfotext = re.findall(
r"<li id=\"copyright\">([^\n\r]*?)</li>", raw
)[0]
except Exception:
except:
pass
if rightsinfotext and not rightsinfourl:
rightsinfourl = baseurl + "#footer"
@ -258,7 +260,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
if "http" not in logourl:
# Probably a relative path, construct the absolute path
logourl = urllib.parse.urljoin(wiki, logourl)
except Exception:
except:
pass
# retrieve some info from the wiki
@ -321,7 +323,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
try:
item.upload(
str(dump),
metadata=md, # type: ignore
metadata=md,
access_key=ia_keys["access"],
secret_key=ia_keys["secret"],
verbose=True,
@ -339,14 +341,12 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
# Update metadata
r = item.modify_metadata(
md, # type: ignore
access_key=ia_keys["access"],
secret_key=ia_keys["secret"],
md, access_key=ia_keys["access"], secret_key=ia_keys["secret"]
)
if r.status_code != 200: # type: ignore
if r.status_code != 200:
print("Error when updating metadata")
print(r.status_code) # type: ignore
print(r.text) # type: ignore
print(r.status_code)
print(r.text)
print(
"You can find it in https://archive.org/details/%s" % (identifier)
@ -358,11 +358,11 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
try:
log(logfile, wiki, dump, "ok")
if logourl:
logo = BytesIO(requests.get(logourl, timeout=10).content) # type: ignore
logo = BytesIO(requests.get(logourl, timeout=10).content)
if ".png" in logourl:
logoextension = "png"
elif logourl.split("."): # type: ignore
logoextension = logourl.split(".")[-1] # type: ignore
elif logourl.split("."):
logoextension = logourl.split(".")[-1]
else:
logoextension = "unknown"
logoname = "wiki-" + wikiname + "_logo." + logoextension
@ -410,11 +410,11 @@ Use --help to print this help."""
listfile = config.listfile
try:
uploadeddumps = [
line.split(";")[1]
for line in open("uploader-%s.log" % (listfile)).read().strip().splitlines()
if len(line.split(";")) > 1
l.split(";")[1]
for l in open("uploader-%s.log" % (listfile)).read().strip().splitlines()
if len(l.split(";")) > 1
]
except Exception:
except:
pass
if config.logfile is None:

@ -1,9 +1,7 @@
from .domain import domain2prefix
from .login import botLogin, clientLogin, indexLogin, uniLogin
from .login import botLogin, clientLogin, fetchLoginToken, indexLogin, uniLogin
from .monkey_patch import mod_requests_text
from .uprint import uprint
from .user_agent import getUserAgent
from .util import cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities
from .wiki_avoid import avoidWikimediaProjects
__all__ = [domain2prefix, botLogin, clientLogin, indexLogin, uniLogin, mod_requests_text, uprint, getUserAgent, cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities, avoidWikimediaProjects] # type: ignore

@ -3,7 +3,7 @@ import re
from wikiteam3.dumpgenerator.config import Config
def domain2prefix(config: Config):
def domain2prefix(config: Config = None, session=None):
"""Convert domain name to a valid prefix filename."""
# At this point, both api and index are supposed to be defined

@ -4,7 +4,7 @@ import time
import requests
from wikiteam3.utils.login.api import botLogin, clientLogin
from wikiteam3.utils.login.api import botLogin, clientLogin, fetchLoginToken
from wikiteam3.utils.login.index import indexLogin

@ -1,6 +1,6 @@
""" Available since MediaWiki 1.27. login to a wiki using username and password (API) """
from typing import Optional
from typing import *
import requests
@ -15,7 +15,8 @@ def fetchLoginToken(session: requests.Session, api: str) -> Optional[str]:
data = response.json()
try:
token = data["query"]["tokens"]["logintoken"]
return token if type(token) is str else None
if type(token) is str:
return token
except KeyError:
print("fetch login token: Oops! Something went wrong -- ", data)
return None

@ -1,7 +1,7 @@
""" Always available login methods.(mw 1.16-1.39)
Even oler versions of MW may work, but not tested. """
from typing import Optional
from typing import *
import lxml.html
import requests
@ -45,7 +45,7 @@ def indexLogin(
"title": "Special:UserLogin", # introduced before MW 1.39.
"force": "", # introduced before MW 1.39, empty string is OK.
}
r = session.post(index, allow_redirects=False, params=params, data=data) # type: ignore
r = session.post(index, allow_redirects=False, params=params, data=data)
if r.status_code == 302:
print("index login: Success! Welcome, ", username, "!")
return session

@ -3,13 +3,13 @@ import requests
from wikiteam3.dumpgenerator.cli.delay import Delay
def mod_requests_text(requests: requests): # type: ignore
def mod_requests_text(requests: requests):
"""Monkey patch `requests.Response.text` to remove BOM"""
def new_text(self):
return self.content.lstrip(b"\xef\xbb\xbf").decode(self.encoding)
requests.Response.text = property(new_text) # type: ignore
requests.Response.text = property(new_text)
class DelaySession:
@ -26,8 +26,8 @@ class DelaySession:
"""Don't forget to call `release()`"""
def new_send(request, **kwargs):
Delay(msg=self.msg, delay=self.delay, config=self.config) # type: ignore
return self.old_send(request, **kwargs) # type: ignore
Delay(msg=self.msg, delay=self.delay, config=self.config)
return self.old_send(request, **kwargs)
self.old_send = self.session.send
self.session.send = new_send

@ -319,10 +319,10 @@ def getUserAgent():
def setupUserAgent(session: requests.Session):
session._orirequest = session.request # type: ignore
session._orirequest = session.request
def newrequest(*args, **kwargs):
session.headers.update({"User-Agent": getUserAgent()})
return session._orirequest(*args, **kwargs) # type: ignore
return session._orirequest(*args, **kwargs)
session.request = newrequest # type: ignore
session.request = newrequest

@ -1,11 +1,11 @@
import re
import sys
from typing import Dict
from typing import *
from wikiteam3.dumpgenerator.config import Config
def avoidWikimediaProjects(config: Config, other: Dict):
def avoidWikimediaProjects(config: Config = None, other: Dict = None):
"""Skip Wikimedia projects and redirect to the dumps website"""
# notice about wikipedia dumps
