diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 571cf10..7ef5649 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -9,7 +9,7 @@ repos: rev: 1.6.0 hooks: - id: poetry-check - - id: poetry-lock + # - id: poetry-lock - id: poetry-export args: ["-f", "requirements.txt", "-o", "requirements.txt"] - repo: https://github.com/pre-commit/pre-commit-hooks diff --git a/poetry.lock b/poetry.lock index 23b1d1e..0ee98d7 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,9 +1,10 @@ -# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand. +# This file is automatically @generated by Poetry and should not be changed by hand. [[package]] name = "atomicwrites" version = "1.4.1" description = "Atomic file writes." +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -14,6 +15,7 @@ files = [ name = "attrs" version = "23.1.0" description = "Classes Without Boilerplate" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -28,56 +30,11 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] -[[package]] -name = "black" -version = "23.7.0" -description = "The uncompromising code formatter." -optional = false -python-versions = ">=3.8" -files = [ - {file = "black-23.7.0-cp310-cp310-macosx_10_16_arm64.whl", hash = "sha256:5c4bc552ab52f6c1c506ccae05681fab58c3f72d59ae6e6639e8885e94fe2587"}, - {file = "black-23.7.0-cp310-cp310-macosx_10_16_universal2.whl", hash = "sha256:552513d5cd5694590d7ef6f46e1767a4df9af168d449ff767b13b084c020e63f"}, - {file = "black-23.7.0-cp310-cp310-macosx_10_16_x86_64.whl", hash = "sha256:86cee259349b4448adb4ef9b204bb4467aae74a386bce85d56ba4f5dc0da27be"}, - {file = "black-23.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:501387a9edcb75d7ae8a4412bb8749900386eaef258f1aefab18adddea1936bc"}, - {file = "black-23.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:fb074d8b213749fa1d077d630db0d5f8cc3b2ae63587ad4116e8a436e9bbe995"}, - {file = "black-23.7.0-cp311-cp311-macosx_10_16_arm64.whl", hash = "sha256:b5b0ee6d96b345a8b420100b7d71ebfdd19fab5e8301aff48ec270042cd40ac2"}, - {file = "black-23.7.0-cp311-cp311-macosx_10_16_universal2.whl", hash = "sha256:893695a76b140881531062d48476ebe4a48f5d1e9388177e175d76234ca247cd"}, - {file = "black-23.7.0-cp311-cp311-macosx_10_16_x86_64.whl", hash = "sha256:c333286dc3ddca6fdff74670b911cccedacb4ef0a60b34e491b8a67c833b343a"}, - {file = "black-23.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:831d8f54c3a8c8cf55f64d0422ee875eecac26f5f649fb6c1df65316b67c8926"}, - {file = "black-23.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:7f3bf2dec7d541b4619b8ce526bda74a6b0bffc480a163fed32eb8b3c9aed8ad"}, - {file = "black-23.7.0-cp38-cp38-macosx_10_16_arm64.whl", hash = "sha256:f9062af71c59c004cd519e2fb8f5d25d39e46d3af011b41ab43b9c74e27e236f"}, - {file = "black-23.7.0-cp38-cp38-macosx_10_16_universal2.whl", hash = "sha256:01ede61aac8c154b55f35301fac3e730baf0c9cf8120f65a9cd61a81cfb4a0c3"}, - {file = "black-23.7.0-cp38-cp38-macosx_10_16_x86_64.whl", hash = "sha256:327a8c2550ddc573b51e2c352adb88143464bb9d92c10416feb86b0f5aee5ff6"}, - {file = "black-23.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:6d1c6022b86f83b632d06f2b02774134def5d4d4f1dac8bef16d90cda18ba28a"}, - {file = "black-23.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:27eb7a0c71604d5de083757fbdb245b1a4fae60e9596514c6ec497eb63f95320"}, - {file = "black-23.7.0-cp39-cp39-macosx_10_16_arm64.whl", hash = "sha256:8417dbd2f57b5701492cd46edcecc4f9208dc75529bcf76c514864e48da867d9"}, - {file = "black-23.7.0-cp39-cp39-macosx_10_16_universal2.whl", hash = "sha256:47e56d83aad53ca140da0af87678fb38e44fd6bc0af71eebab2d1f59b1acf1d3"}, - {file = "black-23.7.0-cp39-cp39-macosx_10_16_x86_64.whl", hash = "sha256:25cc308838fe71f7065df53aedd20327969d05671bac95b38fdf37ebe70ac087"}, - {file = "black-23.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:642496b675095d423f9b8448243336f8ec71c9d4d57ec17bf795b67f08132a91"}, - {file = "black-23.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:ad0014efc7acf0bd745792bd0d8857413652979200ab924fbf239062adc12491"}, - {file = "black-23.7.0-py3-none-any.whl", hash = "sha256:9fd59d418c60c0348505f2ddf9609c1e1de8e7493eab96198fc89d9f865e7a96"}, - {file = "black-23.7.0.tar.gz", hash = "sha256:022a582720b0d9480ed82576c920a8c1dde97cc38ff11d8d8859b3bd6ca9eedb"}, -] - -[package.dependencies] -click = ">=8.0.0" -mypy-extensions = ">=0.4.3" -packaging = ">=22.0" -pathspec = ">=0.9.0" -platformdirs = ">=2" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = {version = ">=3.10.0.0", markers = "python_version < \"3.10\""} - -[package.extras] -colorama = ["colorama (>=0.4.3)"] -d = ["aiohttp (>=3.7.4)"] -jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] -uvloop = ["uvloop (>=0.15.2)"] - [[package]] name = "certifi" version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -89,6 +46,7 @@ files = [ name = "cfgv" version = "3.4.0" description = "Validate configuration and produce human readable error messages." +category = "dev" optional = false python-versions = ">=3.8" files = [ @@ -100,6 +58,7 @@ files = [ name = "charset-normalizer" version = "3.2.0" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." +category = "main" optional = false python-versions = ">=3.7.0" files = [ @@ -180,24 +139,11 @@ files = [ {file = "charset_normalizer-3.2.0-py3-none-any.whl", hash = "sha256:8e098148dd37b4ce3baca71fb394c81dc5d9c7728c95df695d2dca218edf40e6"}, ] -[[package]] -name = "click" -version = "8.1.7" -description = "Composable command line interface toolkit" -optional = false -python-versions = ">=3.7" -files = [ - {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, - {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, -] - -[package.dependencies] -colorama = {version = "*", markers = "platform_system == \"Windows\""} - [[package]] name = "colorama" version = "0.4.6" description = "Cross-platform colored terminal text." 
+category = "main" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" files = [ @@ -209,6 +155,7 @@ files = [ name = "contextlib2" version = "21.6.0" description = "Backports and enhancements for the contextlib module" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -220,6 +167,7 @@ files = [ name = "distlib" version = "0.3.7" description = "Distribution utilities" +category = "dev" optional = false python-versions = "*" files = [ @@ -231,6 +179,7 @@ files = [ name = "docopt" version = "0.6.2" description = "Pythonic argument parser, that will make you smile" +category = "main" optional = false python-versions = "*" files = [ @@ -241,6 +190,7 @@ files = [ name = "file-read-backwards" version = "2.0.0" description = "Memory efficient way of reading files line-by-line from the end of file" +category = "main" optional = false python-versions = "*" files = [ @@ -250,26 +200,25 @@ files = [ [[package]] name = "filelock" -version = "3.12.3" +version = "3.12.2" description = "A platform independent file lock." +category = "dev" optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "filelock-3.12.3-py3-none-any.whl", hash = "sha256:f067e40ccc40f2b48395a80fcbd4728262fab54e232e090a4063ab804179efeb"}, - {file = "filelock-3.12.3.tar.gz", hash = "sha256:0ecc1dd2ec4672a10c8550a8182f1bd0c0a5088470ecd5a125e45f49472fac3d"}, + {file = "filelock-3.12.2-py3-none-any.whl", hash = "sha256:cbb791cdea2a72f23da6ac5b5269ab0a0d161e9ef0100e653b69049a7706d1ec"}, + {file = "filelock-3.12.2.tar.gz", hash = "sha256:002740518d8aa59a26b0c76e10fb8c6e15eae825d34b6fdf670333fd7b938d81"}, ] -[package.dependencies] -typing-extensions = {version = ">=4.7.1", markers = "python_version < \"3.11\""} - [package.extras] -docs = ["furo (>=2023.7.26)", "sphinx (>=7.1.2)", "sphinx-autodoc-typehints (>=1.24)"] -testing = ["covdefaults (>=2.3)", "coverage (>=7.3)", "diff-cover (>=7.7)", "pytest (>=7.4)", "pytest-cov (>=4.1)", "pytest-mock (>=3.11.1)", "pytest-timeout (>=2.1)"] +docs = ["furo (>=2023.5.20)", "sphinx (>=7.0.1)", "sphinx-autodoc-typehints (>=1.23,!=1.23.4)"] +testing = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "diff-cover (>=7.5)", "pytest (>=7.3.1)", "pytest-cov (>=4.1)", "pytest-mock (>=3.10)", "pytest-timeout (>=2.1)"] [[package]] name = "flake8" version = "3.9.2" description = "the modular source code checker: pep8 pyflakes and co" +category = "dev" optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,>=2.7" files = [ @@ -282,34 +231,16 @@ mccabe = ">=0.6.0,<0.7.0" pycodestyle = ">=2.7.0,<2.8.0" pyflakes = ">=2.3.0,<2.4.0" -[[package]] -name = "flake8-black" -version = "0.3.6" -description = "flake8 plugin to call black as a code style validator" -optional = false -python-versions = ">=3.7" -files = [ - {file = "flake8-black-0.3.6.tar.gz", hash = "sha256:0dfbca3274777792a5bcb2af887a4cad72c72d0e86c94e08e3a3de151bb41c34"}, - {file = "flake8_black-0.3.6-py3-none-any.whl", hash = "sha256:fe8ea2eca98d8a504f22040d9117347f6b367458366952862ac3586e7d4eeaca"}, -] - -[package.dependencies] -black = ">=22.1.0" -flake8 = ">=3" -tomli = {version = "*", markers = "python_version < \"3.11\""} - -[package.extras] -develop = ["build", "twine"] - [[package]] name = "identify" -version = "2.5.27" +version = "2.5.26" description = "File identification library for Python" +category = "dev" optional = false python-versions = ">=3.8" files = [ - {file = "identify-2.5.27-py2.py3-none-any.whl", hash = 
"sha256:fdb527b2dfe24602809b2201e033c2a113d7bdf716db3ca8e3243f735dcecaba"}, - {file = "identify-2.5.27.tar.gz", hash = "sha256:287b75b04a0e22d727bc9a41f0d4f3c1bcada97490fa6eabb5b28f0e9097e733"}, + {file = "identify-2.5.26-py2.py3-none-any.whl", hash = "sha256:c22a8ead0d4ca11f1edd6c9418c3220669b3b7533ada0a0ffa6cc0ef85cf9b54"}, + {file = "identify-2.5.26.tar.gz", hash = "sha256:7243800bce2f58404ed41b7c002e53d4d22bcf3ae1b7900c2d7aefd95394bf7f"}, ] [package.extras] @@ -319,6 +250,7 @@ license = ["ukkonen"] name = "idna" version = "3.4" description = "Internationalized Domain Names in Applications (IDNA)" +category = "main" optional = false python-versions = ">=3.5" files = [ @@ -330,6 +262,7 @@ files = [ name = "iniconfig" version = "2.0.0" description = "brain-dead simple config-ini parsing" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -341,6 +274,7 @@ files = [ name = "internetarchive" version = "3.5.0" description = "A Python interface to archive.org." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -366,6 +300,7 @@ types = ["tqdm-stubs (>=0.2.0)", "types-colorama", "types-docopt (>=0.6.10,<0.7. name = "jsonpatch" version = "1.33" description = "Apply JSON-Patches (RFC 6902)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -380,6 +315,7 @@ jsonpointer = ">=1.9" name = "jsonpointer" version = "2.4" description = "Identify specific nodes in a JSON document (RFC 6901)" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" files = [ @@ -391,6 +327,7 @@ files = [ name = "lxml" version = "4.9.3" description = "Powerful and Pythonic XML processing library combining libxml2/libxslt with the ElementTree API." 
+category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, != 3.4.*" files = [ @@ -498,6 +435,7 @@ source = ["Cython (>=0.29.35)"] name = "mccabe" version = "0.6.1" description = "McCabe checker, plugin for flake8" +category = "dev" optional = false python-versions = "*" files = [ @@ -509,6 +447,7 @@ files = [ name = "mwclient" version = "0.10.1" description = "MediaWiki API client" +category = "main" optional = false python-versions = "*" files = [ @@ -520,67 +459,11 @@ files = [ requests-oauthlib = "*" six = "*" -[[package]] -name = "mypy" -version = "1.5.1" -description = "Optional static typing for Python" -optional = false -python-versions = ">=3.8" -files = [ - {file = "mypy-1.5.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:f33592ddf9655a4894aef22d134de7393e95fcbdc2d15c1ab65828eee5c66c70"}, - {file = "mypy-1.5.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:258b22210a4a258ccd077426c7a181d789d1121aca6db73a83f79372f5569ae0"}, - {file = "mypy-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9ec1f695f0c25986e6f7f8778e5ce61659063268836a38c951200c57479cc12"}, - {file = "mypy-1.5.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:abed92d9c8f08643c7d831300b739562b0a6c9fcb028d211134fc9ab20ccad5d"}, - {file = "mypy-1.5.1-cp310-cp310-win_amd64.whl", hash = "sha256:a156e6390944c265eb56afa67c74c0636f10283429171018446b732f1a05af25"}, - {file = "mypy-1.5.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6ac9c21bfe7bc9f7f1b6fae441746e6a106e48fc9de530dea29e8cd37a2c0cc4"}, - {file = "mypy-1.5.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:51cb1323064b1099e177098cb939eab2da42fea5d818d40113957ec954fc85f4"}, - {file = "mypy-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:596fae69f2bfcb7305808c75c00f81fe2829b6236eadda536f00610ac5ec2243"}, - {file = "mypy-1.5.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:32cb59609b0534f0bd67faebb6e022fe534bdb0e2ecab4290d683d248be1b275"}, - {file = "mypy-1.5.1-cp311-cp311-win_amd64.whl", hash = "sha256:159aa9acb16086b79bbb0016145034a1a05360626046a929f84579ce1666b315"}, - {file = "mypy-1.5.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f6b0e77db9ff4fda74de7df13f30016a0a663928d669c9f2c057048ba44f09bb"}, - {file = "mypy-1.5.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:26f71b535dfc158a71264e6dc805a9f8d2e60b67215ca0bfa26e2e1aa4d4d373"}, - {file = "mypy-1.5.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2fc3a600f749b1008cc75e02b6fb3d4db8dbcca2d733030fe7a3b3502902f161"}, - {file = "mypy-1.5.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:26fb32e4d4afa205b24bf645eddfbb36a1e17e995c5c99d6d00edb24b693406a"}, - {file = "mypy-1.5.1-cp312-cp312-win_amd64.whl", hash = "sha256:82cb6193de9bbb3844bab4c7cf80e6227d5225cc7625b068a06d005d861ad5f1"}, - {file = "mypy-1.5.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:4a465ea2ca12804d5b34bb056be3a29dc47aea5973b892d0417c6a10a40b2d65"}, - {file = "mypy-1.5.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:9fece120dbb041771a63eb95e4896791386fe287fefb2837258925b8326d6160"}, - {file = "mypy-1.5.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d28ddc3e3dfeab553e743e532fb95b4e6afad51d4706dd22f28e1e5e664828d2"}, - {file = "mypy-1.5.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:57b10c56016adce71fba6bc6e9fd45d8083f74361f629390c556738565af8eeb"}, - {file = "mypy-1.5.1-cp38-cp38-win_amd64.whl", hash = 
"sha256:ff0cedc84184115202475bbb46dd99f8dcb87fe24d5d0ddfc0fe6b8575c88d2f"}, - {file = "mypy-1.5.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:8f772942d372c8cbac575be99f9cc9d9fb3bd95c8bc2de6c01411e2c84ebca8a"}, - {file = "mypy-1.5.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:5d627124700b92b6bbaa99f27cbe615c8ea7b3402960f6372ea7d65faf376c14"}, - {file = "mypy-1.5.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:361da43c4f5a96173220eb53340ace68cda81845cd88218f8862dfb0adc8cddb"}, - {file = "mypy-1.5.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:330857f9507c24de5c5724235e66858f8364a0693894342485e543f5b07c8693"}, - {file = "mypy-1.5.1-cp39-cp39-win_amd64.whl", hash = "sha256:c543214ffdd422623e9fedd0869166c2f16affe4ba37463975043ef7d2ea8770"}, - {file = "mypy-1.5.1-py3-none-any.whl", hash = "sha256:f757063a83970d67c444f6e01d9550a7402322af3557ce7630d3c957386fa8f5"}, - {file = "mypy-1.5.1.tar.gz", hash = "sha256:b031b9601f1060bf1281feab89697324726ba0c0bae9d7cd7ab4b690940f0b92"}, -] - -[package.dependencies] -mypy-extensions = ">=1.0.0" -tomli = {version = ">=1.1.0", markers = "python_version < \"3.11\""} -typing-extensions = ">=4.1.0" - -[package.extras] -dmypy = ["psutil (>=4.0)"] -install-types = ["pip"] -reports = ["lxml"] - -[[package]] -name = "mypy-extensions" -version = "1.0.0" -description = "Type system extensions for programs checked with the mypy type checker." -optional = false -python-versions = ">=3.5" -files = [ - {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, - {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, -] - [[package]] name = "nodeenv" version = "1.8.0" description = "Node.js virtual environment builder" +category = "dev" optional = false python-versions = ">=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*" files = [ @@ -595,6 +478,7 @@ setuptools = "*" name = "oauthlib" version = "3.2.2" description = "A generic, spec-compliant, thorough implementation of the OAuth request-signing logic" +category = "main" optional = false python-versions = ">=3.6" files = [ @@ -611,6 +495,7 @@ signedtoken = ["cryptography (>=3.0.0)", "pyjwt (>=2.0.0,<3)"] name = "packaging" version = "23.1" description = "Core utilities for Python packages" +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -618,21 +503,11 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] -[[package]] -name = "pathspec" -version = "0.11.2" -description = "Utility library for gitignore style pattern matching of file paths." -optional = false -python-versions = ">=3.7" -files = [ - {file = "pathspec-0.11.2-py3-none-any.whl", hash = "sha256:1d6ed233af05e679efb96b1851550ea95bbb64b7c490b0f5aa52996c11e92a20"}, - {file = "pathspec-0.11.2.tar.gz", hash = "sha256:e0d8d0ac2f12da61956eb2306b69f9469b42f4deb0f3cb6ed47b9cce9996ced3"}, -] - [[package]] name = "platformdirs" version = "3.10.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." 
+category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -646,13 +521,14 @@ test = ["appdirs (==1.4.4)", "covdefaults (>=2.3)", "pytest (>=7.4)", "pytest-co [[package]] name = "pluggy" -version = "1.3.0" +version = "1.2.0" description = "plugin and hook calling mechanisms for python" +category = "dev" optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "pluggy-1.3.0-py3-none-any.whl", hash = "sha256:d89c696a773f8bd377d18e5ecda92b7a3793cbe66c87060a6fb58c7b6e1061f7"}, - {file = "pluggy-1.3.0.tar.gz", hash = "sha256:cf61ae8f126ac6f7c451172cf30e3e43d3ca77615509771b3a984a0730651e12"}, + {file = "pluggy-1.2.0-py3-none-any.whl", hash = "sha256:c2fd55a7d7a3863cba1a013e4e2414658b1d07b6bc57b3919e0c63c9abb99849"}, + {file = "pluggy-1.2.0.tar.gz", hash = "sha256:d12f0c4b579b15f5e054301bb226ee85eeeba08ffec228092f8defbaa3a4c4b3"}, ] [package.extras] @@ -663,6 +539,7 @@ testing = ["pytest", "pytest-benchmark"] name = "poster3" version = "0.8.1" description = "Streaming HTTP uploads and multipart/form-data encoding" +category = "main" optional = false python-versions = "*" files = [ @@ -676,6 +553,7 @@ poster3 = ["buildutils", "sphinx"] name = "pre-commit" version = "2.21.0" description = "A framework for managing and maintaining multi-language pre-commit hooks." +category = "dev" optional = false python-versions = ">=3.7" files = [ @@ -694,6 +572,7 @@ virtualenv = ">=20.10.0" name = "pre-commit-poetry-export" version = "0.1.2" description = "pre-commit hook to keep requirements.txt updated" +category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -705,6 +584,7 @@ files = [ name = "py" version = "1.11.0" description = "library with cross-python path, ini-parsing, io, code, log facilities" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ @@ -716,6 +596,7 @@ files = [ name = "pycodestyle" version = "2.7.0" description = "Python style guide checker" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -727,6 +608,7 @@ files = [ name = "pyflakes" version = "2.3.1" description = "passive checker of Python programs" +category = "dev" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -738,6 +620,7 @@ files = [ name = "pymarkdown" version = "0.1.4" description = "Evaluate code in markdown" +category = "dev" optional = false python-versions = "*" files = [ @@ -751,6 +634,7 @@ toolz = "*" name = "pymysql" version = "1.1.0" description = "Pure Python MySQL Driver" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -766,6 +650,7 @@ rsa = ["cryptography"] name = "pytest" version = "6.2.5" description = "pytest: simple powerful testing with Python" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -790,6 +675,7 @@ testing = ["argcomplete", "hypothesis (>=3.56)", "mock", "nose", "requests", "xm name = "pywikibot" version = "6.6.5" description = "Python MediaWiki Bot Framework" +category = "main" optional = false python-versions = ">=3.5.0" files = [ @@ -826,6 +712,7 @@ wikitextparser = ["wikitextparser (>=0.47.0)", "wikitextparser (>=0.47.5)"] name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" +category = "dev" optional = false python-versions = ">=3.6" files = [ @@ -834,7 +721,6 @@ files = [ {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, @@ -842,15 +728,8 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, @@ -867,7 +746,6 @@ files = [ {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, @@ -875,7 +753,6 @@ files = [ {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, @@ -885,6 +762,7 @@ files = [ name = "requests" version = "2.31.0" description = "Python HTTP for Humans." +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -906,6 +784,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "requests-oauthlib" version = "1.3.1" description = "OAuthlib authentication support for Requests." 
+category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*" files = [ @@ -924,6 +803,7 @@ rsa = ["oauthlib[signedtoken] (>=3.0.0)"] name = "schema" version = "0.7.5" description = "Simple data validation library" +category = "main" optional = false python-versions = "*" files = [ @@ -938,6 +818,7 @@ contextlib2 = ">=0.5.5" name = "setuptools" version = "68.1.2" description = "Easily download, build, install, upgrade, and uninstall Python packages" +category = "main" optional = false python-versions = ">=3.8" files = [ @@ -954,6 +835,7 @@ testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs ( name = "six" version = "1.16.0" description = "Python 2 and 3 compatibility utilities" +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -965,6 +847,7 @@ files = [ name = "toml" version = "0.10.2" description = "Python Library for Tom's Obvious, Minimal Language" +category = "dev" optional = false python-versions = ">=2.6, !=3.0.*, !=3.1.*, !=3.2.*" files = [ @@ -972,21 +855,11 @@ files = [ {file = "toml-0.10.2.tar.gz", hash = "sha256:b3bda1d108d5dd99f4a20d24d9c348e91c4db7ab1b749200bded2f839ccbe68f"}, ] -[[package]] -name = "tomli" -version = "2.0.1" -description = "A lil' TOML parser" -optional = false -python-versions = ">=3.7" -files = [ - {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, - {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, -] - [[package]] name = "toolz" version = "0.12.0" description = "List processing tools and functional utilities" +category = "dev" optional = false python-versions = ">=3.5" files = [ @@ -998,6 +871,7 @@ files = [ name = "tqdm" version = "4.66.1" description = "Fast, Extensible Progress Meter" +category = "main" optional = false python-versions = ">=3.7" files = [ @@ -1014,46 +888,11 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] -[[package]] -name = "types-requests" -version = "2.31.0.2" -description = "Typing stubs for requests" -optional = false -python-versions = "*" -files = [ - {file = "types-requests-2.31.0.2.tar.gz", hash = "sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40"}, - {file = "types_requests-2.31.0.2-py3-none-any.whl", hash = "sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a"}, -] - -[package.dependencies] -types-urllib3 = "*" - -[[package]] -name = "types-urllib3" -version = "1.26.25.14" -description = "Typing stubs for urllib3" -optional = false -python-versions = "*" -files = [ - {file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"}, - {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"}, -] - -[[package]] -name = "typing-extensions" -version = "4.7.1" -description = "Backported and Experimental Type Hints for Python 3.7+" -optional = false -python-versions = ">=3.7" -files = [ - {file = "typing_extensions-4.7.1-py3-none-any.whl", hash = "sha256:440d5dd3af93b060174bf433bccd69b0babc3b15b1a8dca43789fd7f61514b36"}, - {file = "typing_extensions-4.7.1.tar.gz", hash = "sha256:b75ddc264f0ba5615db7ba217daeb99701ad295353c45f9e95963337ceeeffb2"}, -] - [[package]] name = "urllib3" version = "1.26.16" description = "HTTP library with thread-safe connection pooling, file post, 
and more." +category = "main" optional = false python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" files = [ @@ -1068,13 +907,14 @@ socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] [[package]] name = "virtualenv" -version = "20.24.4" +version = "20.24.3" description = "Virtual Python Environment builder" +category = "dev" optional = false python-versions = ">=3.7" files = [ - {file = "virtualenv-20.24.4-py3-none-any.whl", hash = "sha256:29c70bb9b88510f6414ac3e55c8b413a1f96239b6b789ca123437d5e892190cb"}, - {file = "virtualenv-20.24.4.tar.gz", hash = "sha256:772b05bfda7ed3b8ecd16021ca9716273ad9f4467c801f27e83ac73430246dca"}, + {file = "virtualenv-20.24.3-py3-none-any.whl", hash = "sha256:95a6e9398b4967fbcb5fef2acec5efaf9aa4972049d9ae41f95e0972a683fd02"}, + {file = "virtualenv-20.24.3.tar.gz", hash = "sha256:e5c3b4ce817b0b328af041506a2a299418c98747c4b1e68cb7527e74ced23efc"}, ] [package.dependencies] @@ -1083,13 +923,14 @@ filelock = ">=3.12.2,<4" platformdirs = ">=3.9.1,<4" [package.extras] -docs = ["furo (>=2023.7.26)", "proselint (>=0.13)", "sphinx (>=7.1.2)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] +docs = ["furo (>=2023.5.20)", "proselint (>=0.13)", "sphinx (>=7.0.1)", "sphinx-argparse (>=0.4)", "sphinxcontrib-towncrier (>=0.2.1a0)", "towncrier (>=23.6)"] test = ["covdefaults (>=2.3)", "coverage (>=7.2.7)", "coverage-enable-subprocess (>=1)", "flaky (>=3.7)", "packaging (>=23.1)", "pytest (>=7.4)", "pytest-env (>=0.8.2)", "pytest-freezer (>=0.4.8)", "pytest-mock (>=3.11.1)", "pytest-randomly (>=3.12)", "pytest-timeout (>=2.1)", "setuptools (>=68)", "time-machine (>=2.10)"] [[package]] name = "wikitools3" version = "3.0.1" description = "Python package for interacting with a MediaWiki wiki. It is used by WikiTeam for archiving MediaWiki wikis." +category = "main" optional = false python-versions = ">=3.8,<4.0" files = [ @@ -1103,4 +944,4 @@ poster3 = ">=0.8.1,<0.9.0" [metadata] lock-version = "2.0" python-versions = "^3.8" -content-hash = "ebed56288c755209a5da1b75673fdda769a85b22d5f1c26fcb7492d971ffd617" +content-hash = "1eee6035c5660e8cba28942140937e2ceb36bf90482e76fa5ddd054efa3c659c" diff --git a/pyproject.toml b/pyproject.toml index 040dbc5..8453bae 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,10 +77,6 @@ requests = "^2.31.0" flake8 = "^3.9.2" pre-commit = "^2.17.0" pymarkdown = "^0.1.4" -mypy = "^1.5.1" -types-requests = "^2.31.0.2" -# flake8-black may be unnecessary? -flake8-black = "^0.3.6" [build-system] requires = ["poetry-core>=1.0.0"] @@ -88,7 +84,3 @@ build-backend = "poetry.core.masonry.api" [tool.pymarkdown] disable-rules = "line-length,no-inline-html" - -[tool.mypy] -check_untyped_defs = true -ignore_missing_imports = true diff --git a/wikiteam3/dumpgenerator/__init__.py b/wikiteam3/dumpgenerator/__init__.py old mode 100644 new mode 100755 index e69de29..b5da8b1 --- a/wikiteam3/dumpgenerator/__init__.py +++ b/wikiteam3/dumpgenerator/__init__.py @@ -0,0 +1,26 @@ +#!/usr/bin/env python3 + +# DumpGenerator A generator of dumps for wikis +# Copyright (C) 2011-2018 WikiTeam developers +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +# To learn more, read the documentation: +# https://github.com/WikiTeam/wikiteam/wiki + + +from wikiteam3.dumpgenerator.dump import DumpGenerator + + +def main(): + DumpGenerator() diff --git a/wikiteam3/dumpgenerator/__main__.py b/wikiteam3/dumpgenerator/__main__.py index 4981f11..0321cad 100644 --- a/wikiteam3/dumpgenerator/__main__.py +++ b/wikiteam3/dumpgenerator/__main__.py @@ -1,32 +1,6 @@ -#!/usr/bin/env python3 - -# DumpGenerator A generator of dumps for wikis -# Copyright (C) 2011-2018 WikiTeam developers -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 3 of the License, or -# (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -# To learn more, read the documentation: -# https://github.com/WikiTeam/wikiteam/wiki - - -from wikiteam3.dumpgenerator.dump import DumpGenerator - - -def main(): - DumpGenerator() - - if __name__ == "__main__": import sys + from .__init__ import main + sys.exit(main()) diff --git a/wikiteam3/dumpgenerator/api/__init__.py b/wikiteam3/dumpgenerator/api/__init__.py index 3748c5e..7d86c17 100644 --- a/wikiteam3/dumpgenerator/api/__init__.py +++ b/wikiteam3/dumpgenerator/api/__init__.py @@ -2,5 +2,3 @@ from .api import checkAPI, checkRetryAPI, mwGetAPIAndIndex from .get_json import getJSON from .handle_status_code import handleStatusCode from .wiki_check import getWikiEngine - -__all__ = [checkAPI, checkRetryAPI, mwGetAPIAndIndex, getJSON, handleStatusCode, getWikiEngine] # type: ignore diff --git a/wikiteam3/dumpgenerator/api/api.py b/wikiteam3/dumpgenerator/api/api.py index f3d3948..0fa855d 100644 --- a/wikiteam3/dumpgenerator/api/api.py +++ b/wikiteam3/dumpgenerator/api/api.py @@ -1,6 +1,7 @@ import re -from typing import Any, Literal, Optional -from urllib.parse import urljoin, urlparse +import time +from typing import * +from urllib.parse import urljoin, urlparse, urlunparse import mwclient import requests @@ -10,8 +11,7 @@ from wikiteam3.utils import getUserAgent from .get_json import getJSON -# api="", session: requests.Session = None -def checkAPI(api: str, session: requests.Session): +def checkAPI(api="", session: requests.Session = None): """Checking API availability""" global cj # handle redirects @@ -34,31 +34,29 @@ def checkAPI(api: str, session: requests.Session): "MediaWiki API URL not found or giving error: HTTP %d" % r.status_code ) return None - if r is not None: - if "MediaWiki API is not enabled for this site."
in r.text: - return None - try: - result = getJSON(r) - index = None - if result: - try: - index = ( - result["query"]["general"]["server"] - + result["query"]["general"]["script"] - ) - return (True, index, api) - except KeyError: - print("MediaWiki API seems to work but returned no index URL") - return (True, None, api) - except ValueError: - print(repr(r.text)) - print("MediaWiki API returned data we could not parse") - return None + if "MediaWiki API is not enabled for this site." in r.text: + return None + try: + result = getJSON(r) + index = None + if result: + try: + index = ( + result["query"]["general"]["server"] + + result["query"]["general"]["script"] + ) + return (True, index, api) + except KeyError: + print("MediaWiki API seems to work but returned no index URL") + return (True, None, api) + except ValueError: + print(repr(r.text)) + print("MediaWiki API returned data we could not parse") + return None return None -# url="" -def mwGetAPIAndIndex(url: str, session: requests.Session): +def mwGetAPIAndIndex(url="", session: requests.Session = None): """Returns the MediaWiki API and Index.php""" api = "" @@ -110,21 +108,18 @@ def mwGetAPIAndIndex(url: str, session: requests.Session): return api, index -# api="", apiclient=False -def checkRetryAPI(api: str, apiclient: bool, session: requests.Session): +def checkRetryAPI(api="", apiclient=False, session: requests.Session = None): """Call checkAPI and mwclient if necessary""" - check: (tuple[Literal[True], Any, str] | tuple[Literal[True], None, str] | None) + check = None try: check = checkAPI(api, session=session) except requests.exceptions.ConnectionError as e: print(f"Connection error: {str(e)}") - check = None if check and apiclient: apiurl = urlparse(api) try: - # Returns a value, but we're just checking for an error here - mwclient.Site( + site = mwclient.Site( apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=apiurl.scheme, @@ -143,14 +138,13 @@ def checkRetryAPI(api: str, apiclient: bool, session: requests.Session): ) try: - # Returns a value, but we're just checking for an error here - mwclient.Site( + site = mwclient.Site( apiurl.netloc, apiurl.path.replace("api.php", ""), scheme=newscheme, pool=session, ) except KeyError: - check = False # type: ignore + check = False - return check, api # type: ignore + return check, api diff --git a/wikiteam3/dumpgenerator/api/get_json.py b/wikiteam3/dumpgenerator/api/get_json.py index bd1aa48..7a3b227 100644 --- a/wikiteam3/dumpgenerator/api/get_json.py +++ b/wikiteam3/dumpgenerator/api/get_json.py @@ -8,6 +8,6 @@ def getJSON(request: requests.Response): # request.encoding = request.apparent_encoding try: return request.json() - except Exception: + except: # Maybe an older API version which did not return correct JSON return {} diff --git a/wikiteam3/dumpgenerator/api/index_check.py b/wikiteam3/dumpgenerator/api/index_check.py index d29fa2c..50ae58c 100644 --- a/wikiteam3/dumpgenerator/api/index_check.py +++ b/wikiteam3/dumpgenerator/api/index_check.py @@ -3,10 +3,9 @@ import re import requests -# index="", cookies="", session=None -def checkIndex(index: str, cookies: str, session: requests.Session): +def checkIndex(index="", cookies="", session: requests.Session = None): """Checking index.php availability""" - r = session.post(url=index, data={"title": "Special:Version"}, timeout=30) # type: ignore + r = session.post(url=index, data={"title": "Special:Version"}, timeout=30) if r.status_code >= 400: print(f"ERROR: The wiki returned status code HTTP {r.status_code}") return False 
diff --git a/wikiteam3/dumpgenerator/api/namespaces.py b/wikiteam3/dumpgenerator/api/namespaces.py index 93c5f70..b9fbbde 100644 --- a/wikiteam3/dumpgenerator/api/namespaces.py +++ b/wikiteam3/dumpgenerator/api/namespaces.py @@ -1,50 +1,53 @@ import re -import requests - from wikiteam3.dumpgenerator.api import getJSON from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config -def getNamespacesScraper(config: Config, session: requests.Session): +def getNamespacesScraper(config: Config = None, session=None): """Hackishly gets the list of namespaces names and ids from the dropdown in the HTML of Special:AllPages""" """Function called if no API is available""" namespaces = config.namespaces - # namespacenames = {0: ""} # main is 0, no prefix + namespacenames = {0: ""} # main is 0, no prefix if namespaces: r = session.post( - url=config.index, params={"title": "Special:Allpages"}, timeout=30 # type: ignore + url=config.index, params={"title": "Special:Allpages"}, timeout=30 ) raw = r.text - Delay(config=config) + Delay(config=config, session=session) # [^>]*? to include selected="selected" m = re.compile( r'<option [^>]*?value="(?P<namespaceid>\d+)"[^>]*?>(?P<namespacename>[^<]+)</option>' ).finditer(raw) if "all" in namespaces: - namespaces = [int(i.group("namespaceid")) for i in m] - # namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") + namespaces = [] + for i in m: + namespaces.append(int(i.group("namespaceid"))) + namespacenames[int(i.group("namespaceid"))] = i.group("namespacename") else: - namespaces2 = [ - int(i.group("namespaceid")) - for i in m - if int(i.group("namespaceid")) in namespaces - ] + # check if those namespaces really exist in this wiki + namespaces2 = [] + for i in m: + if int(i.group("namespaceid")) in namespaces: + namespaces2.append(int(i.group("namespaceid"))) + namespacenames[int(i.group("namespaceid"))] = i.group( + "namespacename" + ) namespaces = namespaces2 else: namespaces = [0] namespaces = list(set(namespaces)) # uniques print("%d namespaces found" % (len(namespaces))) - return namespaces + return namespaces, namespacenames -def getNamespacesAPI(config: Config, session: requests.Session): +def getNamespacesAPI(config: Config = None, session=None): """Uses the API to get the list of namespaces names and ids""" namespaces = config.namespaces - # namespacenames = {0: ""} # main is 0, no prefix + namespacenames = {0: ""} # main is 0, no prefix if namespaces: r = session.get( url=config.api, @@ -57,34 +60,37 @@ timeout=30, ) result = getJSON(r) - Delay(config=config) + Delay(config=config, session=session) try: nsquery = result["query"]["namespaces"] - except KeyError as ke: + except KeyError: print("Error: could not get namespaces from the API request.") print("HTTP %d" % r.status_code) print(r.text) - raise ke + return None if "all" in namespaces: - namespaces = [int(i) for i in nsquery.keys() if int(i) >= 0] - # -1: Special, -2: Media, excluding - # namespacenames[int(i)] = nsquery[i]["*"] + namespaces = [] + for i in nsquery.keys(): + if int(i) < 0: # -1: Special, -2: Media, excluding + continue + namespaces.append(int(i)) + namespacenames[int(i)] = nsquery[i]["*"] else: # check if those namespaces really exist in this wiki namespaces2 = [] for i in nsquery.keys(): - # bi = i + bi = i i = int(i) if i < 0: # -1: Special, -2: Media, excluding continue if i in namespaces: namespaces2.append(i) - # namespacenames[i] = nsquery[bi]["*"] + namespacenames[i] = nsquery[bi]["*"] namespaces = namespaces2 else: namespaces = [0]
namespaces = list(set(namespaces)) # uniques print("%d namespaces found" % (len(namespaces))) - return namespaces + return namespaces, namespacenames diff --git a/wikiteam3/dumpgenerator/api/page_titles.py b/wikiteam3/dumpgenerator/api/page_titles.py index d1c9b29..4e12ba2 100644 --- a/wikiteam3/dumpgenerator/api/page_titles.py +++ b/wikiteam3/dumpgenerator/api/page_titles.py @@ -1,11 +1,9 @@ import re -from typing import List +import sys from urllib.parse import urlparse import mwclient -import requests from file_read_backwards import FileReadBackwards -from mwclient.page import Page from wikiteam3.dumpgenerator.api.namespaces import ( getNamespacesAPI, @@ -17,10 +15,10 @@ from wikiteam3.utils import cleanHTML, domain2prefix, undoHTMLEntities from wikiteam3.utils.monkey_patch import DelaySession -def getPageTitlesAPI(config: Config, session: requests.Session): +def getPageTitlesAPI(config: Config = None, session=None): """Uses the API to get the list of page titles""" titles = [] - namespaces: List[int] = getNamespacesAPI(config=config, session=session) + namespaces, namespacenames = getNamespacesAPI(config=config, session=session) # apply delay to the session for mwclient.Site.allpages() delay_session = DelaySession( @@ -40,11 +38,10 @@ scheme=apiurl.scheme, pool=session, ) - for page in site.allpages(namespace=str(namespace)): - if page is Page: - title = page.name - titles.append(title) - yield title + for page in site.allpages(namespace=namespace): + title = page.name + titles.append(title) + yield title if len(titles) != len(set(titles)): print("Probably a loop, switching to next namespace") @@ -53,10 +50,10 @@ delay_session.release() -def getPageTitlesScraper(config: Config, session: requests.Session): +def getPageTitlesScraper(config: Config = None, session=None): """Scrape the list of page titles from Special:Allpages""" titles = [] - namespaces = getNamespacesScraper(config=config, session=session) + namespaces, namespacenames = getNamespacesScraper(config=config, session=session) r_title = r'title="(?P<title>[^>]+)">' r_suballpages1 = r'&from=(?P<from>[^>"]+)&to=(?P<to>[^>"]+)">' r_suballpages2 = r'Special:Allpages/(?P<from>[^>"]+)">' @@ -78,7 +75,7 @@ elif re.search(r_suballpages3, raw): r_suballpages = r_suballpages3 c = 0 - # oldfr = "" + oldfr = "" checked_suballpages = [] rawacum = raw while r_suballpages and re.search(r_suballpages, raw) and c < deep: @@ -108,10 +105,10 @@ if name not in checked_suballpages: # to avoid reload dupe subpages links checked_suballpages.append(name) - Delay(config=config) + Delay(config=config, session=session) # print ('Fetching URL: ', url) r = session.get(url=url, timeout=10) - raw = r.text + raw = str(r.text) raw = cleanHTML(raw) rawacum += raw # merge it after removed junk print( @@ -125,26 +122,27 @@ "pages", ) - Delay(config=config) + Delay(config=config, session=session) assert ( currfr is not None ), "re.search found the pattern, but re.finditer fails, why?"
- # oldfr = currfr + oldfr = currfr c += 1 c = 0 m = re.compile(r_title).finditer(rawacum) for i in m: t = undoHTMLEntities(text=i.group("title")) - if not t.startswith("Special:") and t not in titles: - titles.append(t) - c += 1 + if not t.startswith("Special:"): + if t not in titles: + titles.append(t) + c += 1 print(" %d titles retrieved in the namespace %d" % (c, namespace)) return titles -def getPageTitles(config: Config, session: requests.Session): +def getPageTitles(config: Config = None, session=None): """Get list of page titles""" # http://en.wikipedia.org/wiki/Special:AllPages # http://wiki.archiveteam.org/index.php?title=Special:AllPages @@ -170,7 +168,7 @@ def getPageTitles(config: Config, session: requests.Session): if config.api: try: titles = getPageTitlesAPI(config=config, session=session) - except Exception: + except: print("Error: could not get page titles from the API") titles = getPageTitlesScraper(config=config, session=session) elif config.index: @@ -195,7 +193,7 @@ def getPageTitles(config: Config, session: requests.Session): def checkTitleOk( - config: Config, + config: Config = None, ): try: with FileReadBackwards( @@ -210,13 +208,13 @@ def checkTitleOk( lasttitle = frb.readline().strip() if lasttitle == "": lasttitle = frb.readline().strip() - except Exception: + except: lasttitle = "" # probably file does not exists return lasttitle == "--END--" -def readTitles(config: Config, session: requests.Session, start: str, batch: bool): +def readTitles(config: Config = None, session=None, start=None, batch=False): """Read title list from a file, from the title "start" """ if not checkTitleOk(config): getPageTitles(config=config, session=session) @@ -227,7 +225,7 @@ def readTitles(config: Config, session: requests.Session, start: str, batch: boo titlesfile = open(f"{config.path}/{titlesfilename}", encoding="utf-8") titlelist = [] - seeking = start != "" + seeking = start is not None with titlesfile as f: for line in f: title = line.strip() diff --git a/wikiteam3/dumpgenerator/api/wiki_check.py b/wikiteam3/dumpgenerator/api/wiki_check.py index b5d9b0d..93e0465 100644 --- a/wikiteam3/dumpgenerator/api/wiki_check.py +++ b/wikiteam3/dumpgenerator/api/wiki_check.py @@ -5,13 +5,13 @@ import requests from wikiteam3.utils import getUserAgent -def getWikiEngine(url: str, session: requests.Session) -> str: +def getWikiEngine(url="", session: requests.Session = None) -> str: """Returns the wiki engine of a URL, if known""" if not session: session = requests.Session() # Create a new session session.headers.update({"User-Agent": getUserAgent()}) - r = session.post(url=url, timeout=30) # type: ignore + r = session.post(url=url, timeout=30) if r.status_code == 405 or not r.text: r = session.get(url=url, timeout=120) result = r.text diff --git a/wikiteam3/dumpgenerator/cli/cli.py b/wikiteam3/dumpgenerator/cli/cli.py index bad2e2e..582ca86 100644 --- a/wikiteam3/dumpgenerator/cli/cli.py +++ b/wikiteam3/dumpgenerator/cli/cli.py @@ -6,7 +6,7 @@ import os import queue import re import sys -from typing import Any, Dict, Literal, Tuple +from typing import * import requests import urllib3 @@ -15,9 +15,10 @@ from wikiteam3.dumpgenerator.api import checkRetryAPI, getWikiEngine, mwGetAPIAn from wikiteam3.dumpgenerator.api.index_check import checkIndex from wikiteam3.dumpgenerator.config import Config, newConfig from wikiteam3.dumpgenerator.version import getVersion -from wikiteam3.utils import domain2prefix, getUserAgent, mod_requests_text, uniLogin -from wikiteam3.utils.user_agent import 
setupUserAgent +from wikiteam3.utils import domain2prefix, getUserAgent, mod_requests_text +from wikiteam3.utils.login import uniLogin +from ...utils.user_agent import setupUserAgent from .delay import Delay @@ -222,13 +223,13 @@ def getParameters(params=None) -> Tuple[Config, Dict]: ######################################## # Create session - mod_requests_text(requests) # type: ignore # monkey patch + mod_requests_text(requests) # monkey patch session = requests.Session() # Disable SSL verification if args.insecure: session.verify = False - urllib3.disable_warnings() + requests.packages.urllib3.disable_warnings() print("WARNING: SSL certificate verification disabled") # Custom session retry @@ -240,12 +241,14 @@ def getParameters(params=None) -> Tuple[Config, Dict]: class CustomRetry(Retry): def increment(self, method=None, url=None, *args, **kwargs): if "_pool" in kwargs: - conn: urllib3.connectionpool.HTTPSConnectionPool = kwargs["_pool"] + conn = kwargs[ + "_pool" + ] # type: urllib3.connectionpool.HTTPSConnectionPool if "response" in kwargs: try: # drain conn in advance so that it won't be put back into conn.pool kwargs["response"].drain_conn() - except Exception: + except: pass # Useless, retry happens inside urllib3 # for adapters in session.adapters.values(): @@ -253,12 +256,12 @@ def getParameters(params=None) -> Tuple[Config, Dict]: # adapters.poolmanager.clear() # Close existing connection so that a new connection will be used - if hasattr(conn, "pool") and conn.pool is not None: + if hasattr(conn, "pool"): pool = conn.pool # type: queue.Queue try: # Don't directly use this, This closes connection pool by making conn.pool = None conn.close() - except Exception: + except: pass conn.pool = pool return super().increment(method=method, url=url, *args, **kwargs) @@ -271,8 +274,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]: msg = "req retry (%s)" % response.status else: msg = None - # config=None - Delay(config=config, msg=msg, delay=backoff) + Delay(config=None, session=session, msg=msg, delay=backoff) __retries__ = CustomRetry( total=int(args.retries), @@ -290,7 +292,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]: ) session.mount("https://", HTTPAdapter(max_retries=__retries__)) session.mount("http://", HTTPAdapter(max_retries=__retries__)) - except Exception: + except: # Our urllib3/requests is too old pass @@ -299,7 +301,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]: if args.cookies: cj.load(args.cookies) print("Using cookies from %s" % args.cookies) - session.cookies = cj # type: ignore + session.cookies = cj # Setup user agent session.headers.update({"User-Agent": getUserAgent()}) @@ -310,17 +312,17 @@ def getParameters(params=None) -> Tuple[Config, Dict]: session.auth = (args.user, args.password) # Execute meta info params - if args.wiki and args.get_wiki_engine: - print(getWikiEngine(url=args.wiki, session=session)) - sys.exit(0) + if args.wiki: + if args.get_wiki_engine: + print(getWikiEngine(url=args.wiki, session=session)) + sys.exit(0) # Get API and index and verify - api: str = args.api or "" - index: str = args.index or "" + api = args.api if args.api else "" + index = args.index if args.index else "" if api == "" or index == "": if args.wiki: if getWikiEngine(args.wiki, session=session) == "MediaWiki": - index2: str api2, index2 = mwGetAPIAndIndex(args.wiki, session=session) if not api: api = api2 @@ -337,12 +339,9 @@ def getParameters(params=None) -> Tuple[Config, Dict]: # print (api) # print (index) - index2 = "" + index2 = None - 
check: ( - tuple[Literal[True], Any, str] | tuple[Literal[True], None, str] | None - ) = False # type: ignore - checkedapi = "" + check, checkedapi = False, None if api: check, checkedapi = checkRetryAPI( api=api, @@ -350,9 +349,9 @@ def getParameters(params=None) -> Tuple[Config, Dict]: session=session, ) - if api != "" and check: + if api and check: # Replace the index URL we got from the API check - index2 = str(check[1]) + index2 = check[1] api = checkedapi print("API is OK: ", checkedapi) else: @@ -392,10 +391,8 @@ def getParameters(params=None) -> Tuple[Config, Dict]: try: index = "/".join(index.split("/")[:-1]) except AttributeError: - index = "" - if index != "" and checkIndex( - index=index, cookies=args.cookies, session=session - ): + index = None + if index and checkIndex(index=index, cookies=args.cookies, session=session): print("index.php is OK") else: print("Error in index.php.") @@ -476,7 +473,7 @@ def getParameters(params=None) -> Tuple[Config, Dict]: # calculating path, if not defined by user with --path= if not config.path: config.path = "./{}-{}-wikidump".format( - domain2prefix(config=config), + domain2prefix(config=config, session=session), config.date, ) print("No --path argument provided. Defaulting to:") diff --git a/wikiteam3/dumpgenerator/cli/delay.py b/wikiteam3/dumpgenerator/cli/delay.py index 64e64cd..7ebbd02 100644 --- a/wikiteam3/dumpgenerator/cli/delay.py +++ b/wikiteam3/dumpgenerator/cli/delay.py @@ -1,3 +1,5 @@ +import itertools +import sys import threading import time @@ -19,7 +21,7 @@ class Delay: time.sleep(0.3) - def __init__(self, config: Config, msg=None, delay=None): + def __init__(self, config: Config = None, session=None, msg=None, delay=None): """Add a delay if configured for that""" self.ellipses: str = "." diff --git a/wikiteam3/dumpgenerator/config.py b/wikiteam3/dumpgenerator/config.py index 97b6442..21dbff3 100644 --- a/wikiteam3/dumpgenerator/config.py +++ b/wikiteam3/dumpgenerator/config.py @@ -19,12 +19,10 @@ config = { } """ -import contextlib import dataclasses import json import sys -from dataclasses import field -from typing import List +from typing import * def _dataclass_from_dict(klass_or_obj, d): @@ -45,7 +43,7 @@ class Config: retries: int = 0 path: str = "" logs: bool = False - date: str = "" + date: str = False # URL params index: str = "" @@ -58,8 +56,8 @@ class Config: xmlrevisions: bool = False xmlrevisions_page: bool = False images: bool = False - namespaces: List[int] = field(default_factory=lambda: []) - exnamespaces: List[int] = field(default_factory=lambda: []) + namespaces: List[int] = None + exnamespaces: List[int] = None api_chunksize: int = 0 # arvlimit, ailimit, etc export: str = "" # Special:Export page name @@ -75,21 +73,24 @@ def newConfig(configDict) -> Config: return _dataclass_from_dict(Config, configDict) -def loadConfig(config: Config, configfilename=""): +def loadConfig(config: Config = None, configfilename=""): """Load config file""" configDict = dataclasses.asdict(config) if config.path: - with contextlib.suppress(Exception): + try: with open(f"{config.path}/{configfilename}", encoding="utf-8") as infile: configDict.update(json.load(infile)) return newConfig(configDict) + except: + pass + print("There is no config file. we can't resume. 
Start a new dump.") sys.exit() -def saveConfig(config: Config, configfilename=""): +def saveConfig(config: Config = None, configfilename=""): """Save config file""" with open(f"{config.path}/{configfilename}", "w", encoding="utf-8") as outfile: diff --git a/wikiteam3/dumpgenerator/dump/generator.py b/wikiteam3/dumpgenerator/dump/generator.py index 41fa132..80ca3c4 100644 --- a/wikiteam3/dumpgenerator/dump/generator.py +++ b/wikiteam3/dumpgenerator/dump/generator.py @@ -1,12 +1,10 @@ try: import contextlib - - # import http.cookiejar + import http.cookiejar import os import re import sys import traceback - from typing import List from file_read_backwards import FileReadBackwards @@ -22,7 +20,7 @@ except ImportError: ) sys.exit(1) -from typing import Dict +from typing import * from wikiteam3.dumpgenerator.cli import bye, getParameters, welcome from wikiteam3.dumpgenerator.config import Config, loadConfig, saveConfig @@ -77,7 +75,7 @@ class DumpGenerator: else contextlib.nullcontext() ): print(welcome()) - print(f"Analysing {config.api or config.index}") + print(f"Analysing {config.api if config.api else config.index}") # creating path or resuming if desired c = 2 @@ -126,58 +124,57 @@ class DumpGenerator: bye() @staticmethod - def createNewDump(config: Config, other: Dict): - # other: Dict = None + def createNewDump(config: Config = None, other: Dict = None): # we do lazy title dumping here :) images = [] print("Trying generating a new dump into a new directory...") if config.xml: - generateXMLDump(config=config, resume=False, session=other["session"]) + generateXMLDump(config=config, session=other["session"]) checkXMLIntegrity(config=config, session=other["session"]) if config.images: images += Image.getImageNames(config=config, session=other["session"]) - Image.saveImageNames(config=config, images=images) + Image.saveImageNames(config=config, images=images, session=other["session"]) Image.generateImageDump( config=config, other=other, images=images, session=other["session"] ) if config.logs: saveLogs(config=config, session=other["session"]) - # other: Dict = None @staticmethod - def resumePreviousDump(config: Config, other: Dict): - images: List[str] = [] + def resumePreviousDump(config: Config = None, other: Dict = None): + images = [] print("Resuming previous dump process...") if config.xml: # checking xml dump xmliscomplete = False lastxmltitle = None lastxmlrevid = None - - # Exception means probably file does not exist - with contextlib.suppress(Exception): + try: with FileReadBackwards( "%s/%s-%s-%s.xml" % ( config.path, - domain2prefix(config=config), + domain2prefix(config=config, session=other["session"]), config.date, "current" if config.curonly else "history", ), encoding="utf-8", ) as frb: - for line in frb: - if line.strip() == "</mediawiki>": + for l in frb: + if l.strip() == "</mediawiki>": # xml dump is complete xmliscomplete = True break - if xmlrevid := re.search(r" <id>([^<]+)</id>", line): + if xmlrevid := re.search(r" <id>([^<]+)</id>", l): lastxmlrevid = int(xmlrevid.group(1)) - if xmltitle := re.search(r"<title>([^<]+)", line): + if xmltitle := re.search(r"([^<]+)", l): lastxmltitle = undoHTMLEntities(text=xmltitle.group(1)) break + except: + pass # probably file does not exists + if xmliscomplete: print("XML dump was completed in the previous session") elif lastxmltitle: @@ -193,7 +190,7 @@ class DumpGenerator: else: # corrupt? only has XML header? print("XML is corrupt? 
Regenerating...") - generateXMLDump(config=config, resume=False, session=other["session"]) + generateXMLDump(config=config, session=other["session"]) if config.images: # load images list @@ -206,9 +203,7 @@ class DumpGenerator: if os.path.exists(imagesFilePath): with open(imagesFilePath) as f: lines = f.read().splitlines() - images.extend( - line.split("\t") for line in lines if re.search(r"\t", line) - ) + images.extend(l.split("\t") for l in lines if re.search(r"\t", l)) if len(lines) == 0: # empty file lastimage = "--EMPTY--" if not lastimage: @@ -231,14 +226,16 @@ class DumpGenerator: Image.saveImageNames(config=config, images=images) # checking images directory listdir = [] - with contextlib.suppress(OSError): + try: listdir = os.listdir(f"{config.path}/images") + except OSError: + pass # probably directory does not exist listdir = set(listdir) c_desc = 0 c_images = 0 c_checked = 0 for filename, url, uploader, size, sha1 in images: - # lastfilename = filename + lastfilename = filename if other["filenamelimit"] < len(filename.encode("utf-8")): logerror( config=config, diff --git a/wikiteam3/dumpgenerator/dump/image/image.py b/wikiteam3/dumpgenerator/dump/image/image.py index f5eedfc..b79e9eb 100644 --- a/wikiteam3/dumpgenerator/dump/image/image.py +++ b/wikiteam3/dumpgenerator/dump/image/image.py @@ -4,7 +4,7 @@ import re import sys import time import urllib.parse -from typing import Dict, List +from typing import Dict, List, Optional import requests @@ -20,19 +20,19 @@ from wikiteam3.utils import cleanHTML, domain2prefix, sha1File, undoHTMLEntities class Image: @staticmethod - def getXMLFileDesc(config: Config, title: str, session: requests.Session): + def getXMLFileDesc(config: Config = None, title="", session=None): """Get XML for image description page""" - config.curonly = True # tricky to get only the most recent desc + config.curonly = 1 # tricky to get only the most recent desc return "".join( list(getXMLPage(config=config, title=title, verbose=False, session=session)) ) - # other: Dict = None, - # images: List[List] = None, - # session: requests.Session = None, @staticmethod def generateImageDump( - config: Config, other: Dict, images: List[List], session: requests.Session + config: Config = None, + other: Dict = None, + images: List[List] = None, + session: requests.Session = None, ): """Save files and descriptions using a file list\n Deprecated: `start` is not used anymore.""" @@ -49,9 +49,7 @@ class Image: bypass_cdn_image_compression: bool = other["bypass_cdn_image_compression"] - def modify_params( - params: Dict[str, (str | int)] = {} - ) -> Dict[str, (str | int)]: + def modify_params(params: Optional[Dict] = None) -> Dict: """bypass Cloudflare Polish (image optimization)""" if params is None: params = {} @@ -103,7 +101,7 @@ class Image: + "we will not try to download it...", ) else: - Delay(config=config) + Delay(config=config, session=session) original_url = url r = session.head(url=url, params=modify_params(), allow_redirects=True) check_response(r) @@ -118,20 +116,17 @@ class Image: check_response(r) # Try to fix a broken HTTP to HTTPS redirect - if ( - r.status_code == 404 - and original_url_redirected - and ( + if r.status_code == 404 and original_url_redirected: + if ( original_url.split("://")[0] == "http" and url.split("://")[0] == "https" - ) - ): - url = "https://" + original_url.split("://")[1] - # print 'Maybe a broken http to https redirect, trying ', url - r = session.get( - url=url, params=modify_params(), allow_redirects=False - ) - check_response(r) + 
): + url = "https://" + original_url.split("://")[1] + # print 'Maybe a broken http to https redirect, trying ', url + r = session.get( + url=url, params=modify_params(), allow_redirects=False + ) + check_response(r) if r.status_code == 200: try: @@ -165,7 +160,7 @@ class Image: if os.path.isfile(f"{filename3}.desc"): toContinue += 1 else: - Delay(config=config) + Delay(config=config, session=session) # saving description if any title = f"Image:{filename}" try: @@ -236,7 +231,7 @@ class Image: ) @staticmethod - def getImageNames(config: Config, session: requests.Session): + def getImageNames(config: Config = None, session: requests.Session = None): """Get list of image names""" print(")Retrieving image filenames") @@ -256,7 +251,7 @@ class Image: return images @staticmethod - def getImageNamesScraper(config: Config, session: requests.Session): + def getImageNamesScraper(config: Config = None, session: requests.Session = None): """Retrieve file list: filename, url, uploader""" images = [] @@ -273,7 +268,7 @@ class Image: timeout=30, ) raw = r.text - Delay(config=config) + Delay(config=config, session=session) # delicate wiki if re.search( r"(?i)(allowed memory size of \d+ bytes exhausted|Call to a member function getURL)", @@ -350,7 +345,7 @@ class Image: return images @staticmethod - def getImageNamesAPI(config: Config, session: requests.Session): + def getImageNamesAPI(config: Config = None, session: requests.Session = None): """Retrieve file list: filename, url, uploader, size, sha1""" # # Commented by @yzqzss: # https://www.mediawiki.org/wiki/API:Allpages @@ -382,7 +377,7 @@ class Image: r = session.get(url=config.api, params=params, timeout=30) handleStatusCode(r) jsonimages = getJSON(r) - Delay(config=config) + Delay(config=config, session=session) if "query" in jsonimages: countImages += len(jsonimages["query"]["allimages"]) @@ -470,7 +465,7 @@ class Image: r = session.get(url=config.api, params=params, timeout=30) handleStatusCode(r) jsonimages = getJSON(r) - Delay(config=config) + Delay(config=config, session=session) if "query" not in jsonimages: # if the API doesn't return query data, then we're done @@ -517,7 +512,7 @@ class Image: return images @staticmethod - def saveImageNames(config: Config, images: List[List]): + def saveImageNames(config: Config = None, images: List[List] = None, session=None): """Save image list in a file, including filename, url, uploader, size and sha1""" imagesfilename = "{}-{}-images.txt".format( @@ -550,7 +545,7 @@ class Image: print("Image filenames and URLs saved at...", imagesfilename) @staticmethod - def curateImageURL(config: Config, url=""): + def curateImageURL(config: Config = None, url=""): """Returns an absolute URL for an image, adding the domain if missing""" if config.index: diff --git a/wikiteam3/dumpgenerator/dump/misc/index_php.py b/wikiteam3/dumpgenerator/dump/misc/index_php.py index ac96adf..b2ae327 100644 --- a/wikiteam3/dumpgenerator/dump/misc/index_php.py +++ b/wikiteam3/dumpgenerator/dump/misc/index_php.py @@ -1,22 +1,20 @@ import os -import requests - from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config from wikiteam3.utils import removeIP -def saveIndexPHP(config: Config, session: requests.Session): +def saveIndexPHP(config: Config = None, session=None): """Save index.php as .html, to preserve license details available at the botom of the page""" if os.path.exists(f"{config.path}/index.html"): print("index.html exists, do not overwrite") else: print("Downloading index.php (Main Page) 
as index.html") - r = session.post(url=config.index, params=None, timeout=10) # type: ignore - raw = r.text - Delay(config=config) + r = session.post(url=config.index, params=None, timeout=10) + raw = str(r.text) + Delay(config=config, session=session) raw = removeIP(raw=raw) with open(f"{config.path}/index.html", "w", encoding="utf-8") as outfile: outfile.write(raw) diff --git a/wikiteam3/dumpgenerator/dump/misc/site_info.py b/wikiteam3/dumpgenerator/dump/misc/site_info.py index a357017..0a8160f 100644 --- a/wikiteam3/dumpgenerator/dump/misc/site_info.py +++ b/wikiteam3/dumpgenerator/dump/misc/site_info.py @@ -1,61 +1,58 @@ import json import os -import requests - from wikiteam3.dumpgenerator.api import getJSON from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config -def saveSiteInfo(config: Config, session: requests.Session): +def saveSiteInfo(config: Config = None, session=None): """Save a file with site info""" if not config.api: return if os.path.exists(f"{config.path}/siteinfo.json"): print("siteinfo.json exists, do not overwrite") - return - - print("Downloading site info as siteinfo.json") + else: + print("Downloading site info as siteinfo.json") - # MediaWiki 1.13+ - r = session.get( - url=config.api, - params={ - "action": "query", - "meta": "siteinfo", - "siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo", - "sinumberingroup": 1, - "format": "json", - }, - timeout=10, - ) - # MediaWiki 1.11-1.12 - if "query" not in getJSON(r): - r = session.get( - url=config.api, - params={ - "action": "query", - "meta": "siteinfo", - "siprop": "general|namespaces|statistics|dbrepllag|interwikimap", - "format": "json", - }, - timeout=10, - ) - # MediaWiki 1.8-1.10 - if "query" not in getJSON(r): + # MediaWiki 1.13+ r = session.get( url=config.api, params={ "action": "query", "meta": "siteinfo", - "siprop": "general|namespaces", + "siprop": "general|namespaces|statistics|dbrepllag|interwikimap|namespacealiases|specialpagealiases|usergroups|extensions|skins|magicwords|fileextensions|rightsinfo", + "sinumberingroup": 1, "format": "json", }, timeout=10, ) - result = getJSON(r) - Delay(config=config) - with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile: - outfile.write(json.dumps(result, indent=4, sort_keys=True)) + # MediaWiki 1.11-1.12 + if "query" not in getJSON(r): + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "general|namespaces|statistics|dbrepllag|interwikimap", + "format": "json", + }, + timeout=10, + ) + # MediaWiki 1.8-1.10 + if "query" not in getJSON(r): + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "general|namespaces", + "format": "json", + }, + timeout=10, + ) + result = getJSON(r) + Delay(config=config, session=session) + with open(f"{config.path}/siteinfo.json", "w", encoding="utf-8") as outfile: + outfile.write(json.dumps(result, indent=4, sort_keys=True)) diff --git a/wikiteam3/dumpgenerator/dump/misc/special_logs.py b/wikiteam3/dumpgenerator/dump/misc/special_logs.py index 666c8a1..0b35939 100644 --- a/wikiteam3/dumpgenerator/dump/misc/special_logs.py +++ b/wikiteam3/dumpgenerator/dump/misc/special_logs.py @@ -1,10 +1,8 @@ -import requests - from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config -def saveLogs(config: Config, session: requests.Session): 
+def saveLogs(config: Config = None, session=None): """Save Special:Log""" # get all logs from Special:Log """parse @@ -22,4 +20,4 @@ def saveLogs(config: Config, session: requests.Session): """ - Delay(config=config) + Delay(config=config, session=session) diff --git a/wikiteam3/dumpgenerator/dump/misc/special_version.py b/wikiteam3/dumpgenerator/dump/misc/special_version.py index c15e175..5547337 100644 --- a/wikiteam3/dumpgenerator/dump/misc/special_version.py +++ b/wikiteam3/dumpgenerator/dump/misc/special_version.py @@ -1,13 +1,11 @@ import os -import requests - from wikiteam3.dumpgenerator.cli import Delay from wikiteam3.dumpgenerator.config import Config from wikiteam3.utils import removeIP -def saveSpecialVersion(config: Config, session: requests.Session): +def saveSpecialVersion(config: Config = None, session=None): """Save Special:Version as .html, to preserve extensions details""" if os.path.exists(f"{config.path}/SpecialVersion.html"): @@ -15,10 +13,10 @@ def saveSpecialVersion(config: Config, session: requests.Session): else: print("Downloading Special:Version with extensions and other related info") r = session.post( - url=config.index, params={"title": "Special:Version"}, timeout=10 # type: ignore + url=config.index, params={"title": "Special:Version"}, timeout=10 ) - raw = r.text - Delay(config=config) + raw = str(r.text) + Delay(config=config, session=session) raw = str(removeIP(raw=raw)) with open( f"{config.path}/SpecialVersion.html", "w", encoding="utf-8" diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py index 59d9d6e..277b05f 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml.py @@ -1,13 +1,10 @@ -import requests - from wikiteam3.dumpgenerator.config import Config from .page_xml_api import getXMLPageWithApi from .page_xml_export import getXMLPageWithExport -# title="", verbose=True -def getXMLPage(config: Config, title: str, verbose: bool, session: requests.Session): +def getXMLPage(config: Config = None, title="", verbose=True, session=None): if config.xmlapiexport: return getXMLPageWithApi( config=config, title=title, verbose=verbose, session=session diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py index f6a158a..9e9b676 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_api.py @@ -1,7 +1,7 @@ import re import time import traceback -from typing import Dict +from typing import * import requests @@ -11,71 +11,58 @@ from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingEr from wikiteam3.dumpgenerator.log import logerror try: - import xml.etree.ElementTree as ElementTree + import xml.etree.ElementTree as ET except ImportError: - import xml.etree.ElementTree as ElementTree + import xml.etree.ElementTree as ET import xml.dom.minidom as MD -def reconstructRevisions(root: ElementTree.Element): - # print ElementTree.tostring(rev) - page = ElementTree.Element("stub") +def reconstructRevisions(root=None): + # print ET.tostring(rev) + page = ET.Element("stub") edits = 0 - - query: (ElementTree.Element | None) = root.find("query") - if query is None: - raise ValueError("query was none") - pages: (ElementTree.Element | None) = query.find("pages") - if pages is None: - raise ValueError("pages was none") - page_element: (ElementTree.Element | 
None) = query.find("page") - if page_element is None: - raise ValueError("page was none") - revisions: (ElementTree.Element | None) = page_element.find("revisions") - if revisions is None: - raise ValueError("revisions was none") - for rev in revisions.findall("rev"): + for rev in ( + root.find("query").find("pages").find("page").find("revisions").findall("rev") + ): try: - rev_ = ElementTree.SubElement(page, "revision") + rev_ = ET.SubElement(page, "revision") # id - ElementTree.SubElement(rev_, "id").text = rev.attrib["revid"] + ET.SubElement(rev_, "id").text = rev.attrib["revid"] # parentid (optional, export-0.7+) if "parentid" in rev.attrib: - ElementTree.SubElement(rev_, "parentid").text = rev.attrib["parentid"] + ET.SubElement(rev_, "parentid").text = rev.attrib["parentid"] # timestamp - ElementTree.SubElement(rev_, "timestamp").text = rev.attrib["timestamp"] + ET.SubElement(rev_, "timestamp").text = rev.attrib["timestamp"] # contributor - contributor = ElementTree.SubElement(rev_, "contributor") + contributor = ET.SubElement(rev_, "contributor") if "userhidden" not in rev.attrib: - ElementTree.SubElement(contributor, "username").text = rev.attrib[ - "user" - ] - ElementTree.SubElement(contributor, "id").text = rev.attrib["userid"] + ET.SubElement(contributor, "username").text = rev.attrib["user"] + ET.SubElement(contributor, "id").text = rev.attrib["userid"] else: contributor.set("deleted", "deleted") # comment (optional) if "commenthidden" in rev.attrib: print("commenthidden") - comment = ElementTree.SubElement(rev_, "comment") + comment = ET.SubElement(rev_, "comment") comment.set("deleted", "deleted") elif "comment" in rev.attrib and rev.attrib["comment"]: # '' is empty - comment = ElementTree.SubElement(rev_, "comment") + comment = ET.SubElement(rev_, "comment") comment.text = rev.attrib["comment"] # minor edit (optional) if "minor" in rev.attrib: - ElementTree.SubElement(rev_, "minor") + ET.SubElement(rev_, "minor") # model and format (optional, export-0.8+) if "contentmodel" in rev.attrib: - ElementTree.SubElement(rev_, "model").text = rev.attrib[ + ET.SubElement(rev_, "model").text = rev.attrib[ "contentmodel" ] # default: 'wikitext' if "contentformat" in rev.attrib: - ElementTree.SubElement(rev_, "format").text = rev.attrib[ + ET.SubElement(rev_, "format").text = rev.attrib[ "contentformat" ] # default: 'text/x-wiki' # text - text = ElementTree.SubElement(rev_, "text") + text = ET.SubElement(rev_, "text") if "texthidden" not in rev.attrib: text.attrib["xml:space"] = "preserve" text.attrib["bytes"] = rev.attrib["size"] @@ -85,28 +72,24 @@ def reconstructRevisions(root: ElementTree.Element): text.set("deleted", "deleted") # sha1 if "sha1" in rev.attrib: - sha1 = ElementTree.SubElement(rev_, "sha1") + sha1 = ET.SubElement(rev_, "sha1") sha1.text = rev.attrib["sha1"] elif "sha1hidden" in rev.attrib: - ElementTree.SubElement(rev_, "sha1") # stub + ET.SubElement(rev_, "sha1") # stub edits += 1 except Exception as e: - # logerror(config=config, text='Error reconstructing revision, xml:%s' % (ElementTree.tostring(rev))) - print(ElementTree.tostring(rev)) + # logerror(config=config, text='Error reconstructing revision, xml:%s' % (ET.tostring(rev))) + print(ET.tostring(rev)) traceback.print_exc() - page = None # type: ignore + page = None edits = 0 raise e return page, edits -# headers: Dict = None, params: Dict = None def getXMLPageCoreWithApi( - headers: Dict, - params: Dict[str, (str | int)], - config: Config, - session: requests.Session, + headers: Dict = None, params: Dict = 
None, config: Config = None, session=None
 ):
     """ """
     # just send the API request
@@ -118,7 +101,7 @@ def getXMLPageCoreWithApi(
     increment = 20  # increment every retry

     while not re.search(
-        r"" if config.curonly else r"", xml
+        r"" if not config.curonly else r"", xml
     ) or re.search(r"", xml):
         if c > 0 and c < maxretries:
             wait = (
@@ -131,8 +114,8 @@
             time.sleep(wait)
             # reducing server load requesting smallest chunks (if curonly then
             # rvlimit = 1 from mother function)
-            if int(params["rvlimit"]) > 1:
-                params["rvlimit"] = int(params["rvlimit"]) // 2  # half
+            if params["rvlimit"] > 1:
+                params["rvlimit"] = params["rvlimit"] / 2  # half
         if c >= maxretries:
             print(" We have retried %d times" % (c))
             print(
@@ -147,7 +130,7 @@
             print(" Saving in the errors log, and skipping...")
             logerror(
                 config=config,
-                text=f'Error while retrieving the last revision of "{params["titles" if config.xmlapiexport else "pages"]}". Skipping.',  # .decode("utf-8")
+                text=f'Error while retrieving the last revision of "{params["titles" if config.xmlapiexport else "pages"].decode("utf-8")}". Skipping.',
             )
             raise ExportAbortedError(config.index)
             # FIXME HANDLE HTTP Errors HERE
@@ -166,10 +149,7 @@
     return xml


-# title="", verbose=True
-def getXMLPageWithApi(
-    config: Config, title: str, verbose: bool, session: requests.Session
-):
+def getXMLPageWithApi(config: Config = None, title="", verbose=True, session=None):
     """Get the full history (or current only) of a page using API:Query
     if params['curonly'] is set, then using export&exportwrap to export
     """
@@ -190,52 +170,42 @@
         "rvcontinue": None,
         "rvlimit": config.api_chunksize,
     }
-    firstpartok: bool = False
-    lastcontinue: str = ""
+    firstpartok = False
+    lastcontinue = None
     numberofedits = 0
     ret = ""
-    continueKey: str = ""
+    continueKey: Optional[str] = None
     while True:
         # in case the last request is not right, saving last time's progress
         if not firstpartok:
             try:
                 lastcontinue = params[continueKey]
-            except Exception:
-                lastcontinue = ""
+            except:
+                lastcontinue = None

-        xml = getXMLPageCoreWithApi(
-            headers={}, params=params, config=config, session=session
-        )
+        xml = getXMLPageCoreWithApi(params=params, config=config, session=session)
         if xml == "":
             # just return so that we can continue, and getXMLPageCoreWithApi will log the error
             return
         try:
-            root = ElementTree.fromstring(xml.encode("utf-8"))
-        except Exception:
+            root = ET.fromstring(xml.encode("utf-8"))
+        except:
             continue
         try:
-            ret_query: (ElementTree.Element | None) = root.find("query")
-            if ret_query is None:
-                raise Exception("query was none")
-            ret_pages: (ElementTree.Element | None) = root.find("pages")
-            if ret_pages is None:
-                raise Exception("pages was none")
-            ret_page = ret_pages.find("page")
-            if ret_page is None:
-                continue
-        except Exception:
+            retpage = root.find("query").find("pages").find("page")
+        except:
             continue
-        if "missing" in ret_page.attrib or "invalid" in ret_page.attrib:
+        if "missing" in retpage.attrib or "invalid" in retpage.attrib:
             print("Page not found")
             raise PageMissingError(params["titles"], xml)
         if not firstpartok:
             try:
                 # build the firstpart by ourselves to improve the memory usage
                 ret = "  <page>\n"
-                ret += "    <title>%s</title>\n" % (ret_page.attrib["title"])
-                ret += "    <ns>%s</ns>\n" % (ret_page.attrib["ns"])
-                ret += "    <id>%s</id>\n" % (ret_page.attrib["pageid"])
-            except Exception:
+                ret += "    <title>%s</title>\n" % (retpage.attrib["title"])
+                ret += "    <ns>%s</ns>\n" % (retpage.attrib["ns"])
+                ret += "    <id>%s</id>\n" % (retpage.attrib["pageid"])
+
except: firstpartok = False continue else: @@ -243,34 +213,30 @@ def getXMLPageWithApi( yield ret continueVal = None - continue_element: (ElementTree.Element | None) = root.find("continue") - query_continue_element: (ElementTree.Element | None) = root.find( - "query-continue" - ) - if continue_element is not None: + if root.find("continue") is not None: # uses continue.rvcontinue # MW 1.26+ continueKey = "rvcontinue" - continueVal = continue_element.attrib["rvcontinue"] - elif query_continue_element is not None: - rev_continue = query_continue_element.find("revisions") - assert rev_continue is not None, "Should only have revisions continue" - if "rvcontinue" in rev_continue.attrib: + continueVal = root.find("continue").attrib["rvcontinue"] + elif root.find("query-continue") is not None: + revContinue = root.find("query-continue").find("revisions") + assert revContinue is not None, "Should only have revisions continue" + if "rvcontinue" in revContinue.attrib: # MW 1.21 ~ 1.25 continueKey = "rvcontinue" - continueVal = rev_continue.attrib["rvcontinue"] - elif "rvstartid" in rev_continue.attrib: + continueVal = revContinue.attrib["rvcontinue"] + elif "rvstartid" in revContinue.attrib: # TODO: MW ???? continueKey = "rvstartid" - continueVal = rev_continue.attrib["rvstartid"] + continueVal = revContinue.attrib["rvstartid"] else: # blindly assume the first attribute is the continue key # may never happen assert ( - len(rev_continue.attrib) > 0 + len(revContinue.attrib) > 0 ), "Should have at least one attribute" - for continueKey in rev_continue.attrib.keys(): - continueVal = rev_continue.attrib[continueKey] + for continueKey in revContinue.attrib.keys(): + continueVal = revContinue.attrib[continueKey] break if continueVal is not None: params[continueKey] = continueVal @@ -280,9 +246,7 @@ def getXMLPageWithApi( # transform the revision rev_, edits = reconstructRevisions(root=root) - xmldom = MD.parseString( - b"" + ElementTree.tostring(rev_) + b"" - ) + xmldom = MD.parseString(b"" + ET.tostring(rev_) + b"") # convert it into text in case it throws MemoryError # delete the first three line and last two line,which is for setting the indent ret += "".join(xmldom.toprettyxml(indent=" ").splitlines(True)[3:-2]) @@ -290,7 +254,7 @@ def getXMLPageWithApi( numberofedits += edits if config.curonly or continueVal is None: # no continue break - except Exception: + except: traceback.print_exc() params["rvcontinue"] = lastcontinue ret = "" @@ -303,9 +267,7 @@ def getXMLPageWithApi( "export": 1, "exportnowrap": 1, } - xml = getXMLPageCoreWithApi( - headers={}, params=params, config=config, session=session - ) + xml = getXMLPageCoreWithApi(params=params, config=config, session=session) if xml == "": raise ExportAbortedError(config.index) if "" not in xml: diff --git a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py index 7d67f55..350dbd3 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlexport/page_xml_export.py @@ -1,7 +1,7 @@ import re import sys import time -from typing import Dict +from typing import * import requests @@ -12,12 +12,8 @@ from wikiteam3.dumpgenerator.log import logerror from wikiteam3.utils import uprint -# headers: Dict = None, params: Dict = None def getXMLPageCore( - headers: Dict, - params: Dict[str, (str | int)], - config: Config, - session: requests.Session, + headers: Dict = None, params: Dict = None, config: Config = None, session=None ) 
-> str: """""" # returns a XML containing params['limit'] revisions (or current only), ending in @@ -41,8 +37,8 @@ def getXMLPageCore( time.sleep(wait) # reducing server load requesting smallest chunks (if curonly then # limit = 1 from mother function) - if int(params["limit"]) > 1: - params["limit"] = int(params["limit"]) // 2 # half + if params["limit"] > 1: + params["limit"] = params["limit"] / 2 # half if c >= maxretries: print(" We have retried %d times" % (c)) print( @@ -56,9 +52,9 @@ def getXMLPageCore( # params['curonly'] should mean that we've already tried this # fallback, because it's set by the following if and passed to # getXMLPageCore - if not config.curonly: # and "curonly" not in params: + if not config.curonly and "curonly" not in params: print(" Trying to save only the last revision for this page...") - params["curonly"] = True + params["curonly"] = 1 logerror( config=config, to_stdout=True, @@ -79,7 +75,7 @@ def getXMLPageCore( try: r = session.post( url=config.index, params=params, headers=headers, timeout=10 - ) # type: ignore + ) handleStatusCode(r) xml = r.text except requests.exceptions.ConnectionError as e: @@ -93,9 +89,7 @@ def getXMLPageCore( return xml -def getXMLPageWithExport( - config: Config, title: str, verbose: bool, session: requests.Session -): +def getXMLPageWithExport(config: Config = None, title="", verbose=True, session=None): """Get the full history (or current only) of a page""" truncated = False @@ -103,17 +97,9 @@ def getXMLPageWithExport( title_ = re.sub(" ", "_", title_) # do not convert & into %26, title_ = re.sub('&', '%26', title_) if config.export: - params: Dict[str, (str | int)] = { - "title": config.export, - "pages": title_, - "action": "submit", - } + params = {"title": config.export, "pages": title_, "action": "submit"} else: - params = { - "title": "Special:Export", - "pages": title_, - "action": "submit", - } + params = {"title": "Special:Export", "pages": title_, "action": "submit"} if config.curonly: params["curonly"] = 1 params["limit"] = 1 @@ -128,7 +114,7 @@ def getXMLPageWithExport( if config.templates: params["templates"] = 1 - xml = getXMLPageCore(headers={}, params=params, config=config, session=session) + xml = getXMLPageCore(params=params, config=config, session=session) if xml == "": raise ExportAbortedError(config.index) if "" not in xml: @@ -153,12 +139,10 @@ def getXMLPageWithExport( # get the last timestamp from the acum XML params["offset"] = re.findall(r_timestamp, xml)[-1] try: - xml2 = getXMLPageCore( - headers={}, params=params, config=config, session=session - ) + xml2 = getXMLPageCore(params=params, config=config, session=session) except MemoryError: print("The page's history exceeds our memory, halving limit.") - params["limit"] = int(params["limit"]) // 2 + params["limit"] /= 2 continue # are there more edits in this next XML chunk or no ? @@ -193,7 +177,7 @@ def getXMLPageWithExport( ) except MemoryError: "The page's history exceeds our memory, halving limit." 
- params["limit"] = int(params["limit"]) // 2 + params["limit"] /= 2 continue xml = xml2 edit_count += len(re.findall(r_timestamp, xml)) diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py index 958072d..1af38c9 100644 --- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py +++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions.py @@ -1,15 +1,14 @@ import sys import time -from typing import List +from datetime import datetime +from typing import * from urllib.parse import urlparse import lxml.etree import mwclient import requests -from lxml.etree import _ElementTree as ElementTree -from mwclient.errors import InvalidResponse, MwClientError -# from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI +from wikiteam3.dumpgenerator.api.namespaces import getNamespacesAPI from wikiteam3.dumpgenerator.api.page_titles import readTitles from wikiteam3.dumpgenerator.config import Config from wikiteam3.dumpgenerator.dump.page.xmlrev.xml_revisions_page import ( @@ -23,8 +22,9 @@ ALL_NAMESPACE = -1 def getXMLRevisionsByAllRevisions( - config: Config, - site: mwclient.Site, # = None, + config: Config = None, + session=None, + site: mwclient.Site = None, nscontinue=None, arvcontinue=None, ): @@ -62,7 +62,55 @@ def getXMLRevisionsByAllRevisions( if _arvcontinue is not None: arvparams["arvcontinue"] = _arvcontinue - if config.curonly: + if not config.curonly: + # We have to build the XML manually... + # Skip flags, presumably needed to add which is in the schema. + # Also missing: parentid and contentformat. + arvparams[ + "arvprop" + ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags" + print( + "Trying to get wikitext from the allrevisions API and to build the XML" + ) + while True: + try: + arvrequest = site.api(http_method=config.http_method, **arvparams) + except requests.exceptions.HTTPError as e: + if e.response.status_code != 405 or config.http_method != "POST": + raise + print("POST request to the API failed, retrying with GET") + config.http_method = "GET" + continue + except requests.exceptions.ReadTimeout as err: + # Hopefully temporary, just wait a bit and continue with the same request. + # No point putting a limit to retries, we'd need to abort everything. + # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient + # to use the retry adapter we use for our own requests session? + print(f"ERROR: {str(err)}") + print("Sleeping for 20 seconds") + time.sleep(20) + continue + except mwclient.errors.InvalidResponse as e: + if ( + not e.response_text.startswith("") + or config.http_method != "POST" + ): + raise + + print( + "POST request to the API failed (got HTML), retrying with GET" + ) + config.http_method = "GET" + continue + for page in arvrequest["query"]["allrevisions"]: + yield makeXmlFromPage(page, arvparams.get("arvcontinue", "")) + if "continue" in arvrequest: + arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"] + else: + # End of continuation. We are done with this namespace. 
+ break + + else: # FIXME: this is not curonly, just different strategy to do all revisions # Just cycle through revision IDs and use the XML as is print("Trying to list the revisions and to export them one by one") @@ -141,69 +189,22 @@ def getXMLRevisionsByAllRevisions( ) except requests.exceptions.ReadTimeout as err: # As above - print(f"ERROR: {str(err)}\nSleeping for 20 seconds") - time.sleep(20) - # But avoid rewriting the same revisions - arvrequest["query"]["allrevisions"] = [] - - else: - # We have to build the XML manually... - # Skip flags, presumably needed to add which is in the schema. - # Also missing: parentid and contentformat. - arvparams[ - "arvprop" - ] = "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags" - print( - "Trying to get wikitext from the allrevisions API and to build the XML" - ) - while True: - try: - arvrequest = site.api(http_method=config.http_method, **arvparams) - except requests.exceptions.HTTPError as e: - if e.response.status_code != 405 or config.http_method != "POST": - raise - print("POST request to the API failed, retrying with GET") - config.http_method = "GET" - continue - except requests.exceptions.ReadTimeout as err: - # Hopefully temporary, just wait a bit and continue with the same request. - # No point putting a limit to retries, we'd need to abort everything. - # TODO: reuse the retry logic of the checkAPI phase? Or force mwclient - # to use the retry adapter we use for our own requests session? print(f"ERROR: {str(err)}") print("Sleeping for 20 seconds") time.sleep(20) - continue - except InvalidResponse as e: - if ( - e.response_text is not None - and not e.response_text.startswith("") - ) or config.http_method != "POST": - raise - - print( - "POST request to the API failed (got HTML), retrying with GET" - ) - config.http_method = "GET" - continue - for page in arvrequest["query"]["allrevisions"]: - yield makeXmlFromPage(page, arvparams.get("arvcontinue", "")) - if "continue" in arvrequest: - arvparams["arvcontinue"] = arvrequest["continue"]["arvcontinue"] - else: - # End of continuation. We are done with this namespace. - break + # But avoid rewriting the same revisions + arvrequest["query"]["allrevisions"] = [] def getXMLRevisionsByTitles( - config: Config, session: requests.Session, site: mwclient.Site, start: str + config: Config = None, session=None, site: mwclient.Site = None, start=None ): c = 0 if config.curonly: # The raw XML export in the API gets a title and gives the latest revision. # We could also use the allpages API as generator but let's be consistent. print("Getting titles to export the latest revision for each") - for title in readTitles(config, session=session, start=start, batch=False): + for title in readTitles(config, session=session, start=start): # TODO: respect verbose flag, reuse output from getXMLPage print(f" {title}") # TODO: as we're doing one page and revision at a time, we might @@ -237,7 +238,7 @@ def getXMLRevisionsByTitles( # The XML needs to be made manually because the export=1 option # refuses to return an arbitrary number of revisions (see above). print("Getting titles to export all the revisions of each") - titlelist: (str | List[str]) = [] + titlelist = [] # TODO: Decide a suitable number of a batched request. Careful: # batched responses may not return all revisions. for titlelist in readTitles(config, session=session, start=start, batch=False): @@ -247,11 +248,9 @@ def getXMLRevisionsByTitles( print(f" {title}") # Try and ask everything. 
At least on MediaWiki 1.16, uknown props are discarded: # "warnings":{"revisions":{"*":"Unrecognized values for parameter 'rvprop': userid, sha1, contentmodel"}}} - if titlelist is List: - titlelist = "|".join(titlelist) pparams = { "action": "query", - "titles": titlelist, + "titles": "|".join(titlelist), "prop": "revisions", "rvlimit": config.api_chunksize, "rvprop": "ids|timestamp|user|userid|size|sha1|contentmodel|comment|content|flags", @@ -264,13 +263,11 @@ def getXMLRevisionsByTitles( print("POST request to the API failed, retrying with GET") config.http_method = "GET" prequest = site.api(http_method=config.http_method, **pparams) - except InvalidResponse: - if titlelist is List: - titlelist = "; ".join(titlelist) + except mwclient.errors.InvalidResponse: logerror( config=config, to_stdout=True, - text=f"Error: page inaccessible? Could not export page: {titlelist}", + text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}', ) continue @@ -282,12 +279,10 @@ def getXMLRevisionsByTitles( try: pages = prequest["query"]["pages"] except KeyError: - if titlelist is List: - titlelist = "; ".join(titlelist) logerror( config=config, to_stdout=True, - text=f"Error: page inaccessible? Could not export page: {titlelist}", + text=f'Error: page inaccessible? Could not export page: {"; ".join(titlelist)}', ) break # Go through the data we got to build the XML. @@ -295,12 +290,10 @@ def getXMLRevisionsByTitles( try: yield makeXmlFromPage(pages[pageid], None) except PageMissingError: - if titlelist is List: - titlelist = "; ".join(titlelist) logerror( config=config, to_stdout=True, - text=f"Error: empty revision from API. Could not export page: {titlelist}", + text=f'Error: empty revision from API. Could not export page: {"; ".join(titlelist)}', ) continue @@ -331,12 +324,8 @@ def getXMLRevisionsByTitles( print(f"\n-> Downloaded {c} pages\n") -# useAllrevision=True, lastPage=None def getXMLRevisions( - config: Config, - session: requests.Session, - useAllrevision: bool, - lastPage: (ElementTree | None), + config: Config = None, session=None, useAllrevision=True, lastPage=None ): # FIXME: actually figure out the various strategies for each MediaWiki version apiurl = urlparse(config.api) @@ -353,7 +342,7 @@ def getXMLRevisions( # Find last title if lastPage is not None: try: - lastNs = int(lastPage.find("ns", None).text) + lastNs = int(lastPage.find("ns").text) lastArvcontinue = lastPage.attrib["arvcontinue"] except Exception: print( @@ -361,38 +350,43 @@ def getXMLRevisions( ) raise nscontinue = lastNs - arvcontinue = lastArvcontinue or None + arvcontinue = lastArvcontinue + if not arvcontinue: + arvcontinue = None else: nscontinue = None arvcontinue = None try: - return getXMLRevisionsByAllRevisions(config, site, nscontinue, arvcontinue) - except (KeyError, InvalidResponse) as e: + return getXMLRevisionsByAllRevisions( + config, session, site, nscontinue, arvcontinue + ) + except (KeyError, mwclient.errors.InvalidResponse) as e: + print(e) # TODO: check whether the KeyError was really for a missing arv API print( - f"{str(e)}/nWarning. Could not use allrevisions. Wiki too old? Try to use --xmlrevisions_page" + "Warning. Could not use allrevisions. Wiki too old? 
Try to use --xmlrevisions_page"
             )
             sys.exit()
     else:
         # Find last title
         if lastPage is not None:
             try:
-                start = lastPage.find("title", None)
+                start = lastPage.find("title")
             except Exception:
                 print(
                     f"Failed to find title in last trunk XML: {lxml.etree.tostring(lastPage)}"
                 )
                 raise
         else:
-            start = ""
+            start = None
         try:
             # # Uncomment these lines to raise an KeyError for testing
             # raise KeyError(999999)
             # # DO NOT UNCOMMMENT IN RELEASE
             return getXMLRevisionsByTitles(config, session, site, start)
-        except MwClientError as e:
+        except mwclient.errors.MwClientError as e:
             print(e)
             print("This mwclient version seems not to work for us. Exiting.")
             sys.exit()
diff --git a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
index b57d03d..a249a26 100644
--- a/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
+++ b/wikiteam3/dumpgenerator/dump/page/xmlrev/xml_revisions_page.py
@@ -6,7 +6,7 @@ from wikiteam3.dumpgenerator.exceptions import PageMissingError


 def makeXmlPageFromRaw(xml, arvcontinue) -> str:
     """Discard the metadata around a <page> element in <mediawiki> string"""
-    root = etree.XML(text=xml, parser=None)
+    root = etree.XML(xml)
     find = etree.XPath("//*[local-name() = 'page']")
     page = find(root)[0]
     if arvcontinue is not None:
@@ -14,7 +14,7 @@ def makeXmlPageFromRaw(xml, arvcontinue) -> str:
     # The <page> tag will inherit the namespace, like:
     #
     # FIXME: pretty_print doesn't seem to work, only adds a newline
-    return etree.tostring(page, pretty_print=True, encoding="unicode")  # type: ignore
+    return etree.tostring(page, pretty_print=True, encoding="unicode")


 def makeXmlFromPage(page: dict, arvcontinue) -> str:
@@ -124,4 +124,4 @@ def makeXmlFromPage(page: dict, arvcontinue) -> str:
     except KeyError as e:
         print(e)
         raise PageMissingError(page["title"], e)
-    return etree.tostring(p, pretty_print=True, encoding="unicode")  # type: ignore
+    return etree.tostring(p, pretty_print=True, encoding="unicode")
diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py
index d8a4654..991323d 100644
--- a/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py
+++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_dump.py
@@ -1,12 +1,8 @@
 import re
 import sys
-from io import TextIOWrapper
+from typing import *

 import lxml.etree
-import requests
-
-# from typing import *
-from lxml.etree import _ElementTree as ElementTree

 from wikiteam3.dumpgenerator.api.page_titles import readTitles
 from wikiteam3.dumpgenerator.cli import Delay
@@ -23,14 +19,12 @@
 from wikiteam3.dumpgenerator.log import logerror
 from wikiteam3.utils import cleanXML, domain2prefix, undoHTMLEntities

-# lastPage=None,
-# useAllrevisions=False,
 def doXMLRevisionDump(
-    config: Config,
-    session: requests.Session,
-    xmlfile: TextIOWrapper,
-    lastPage: (ElementTree | None),
-    useAllrevisions: bool,
+    config: Config = None,
+    session=None,
+    xmlfile=None,
+    lastPage=None,
+    useAllrevisions=False,
 ):
     try:
         r_timestamp = "<timestamp>([^<]+)</timestamp>"
@@ -47,17 +41,16 @@ def doXMLRevisionDump(
             if arvcontinueRe := re.findall(r_arvcontinue, xml):
                 curArvcontinue = arvcontinueRe[0]
                 if lastArvcontinue != curArvcontinue:
-                    Delay(config=config)
+                    Delay(config=config, session=session)
                     lastArvcontinue = curArvcontinue
             # Due to how generators work, it's expected this may be less
             xml = cleanXML(xml=xml)
             xmlfile.write(xml)
             xmltitle = re.search(r"<title>([^<]+)</title>", xml)
-            if xmltitle is not None:
-                title = undoHTMLEntities(text=xmltitle[1])
-                print(f"{title}, {numrevs} edits (--xmlrevisions)")
-
# Delay(config=config) + title = undoHTMLEntities(text=xmltitle.group(1)) + print(f"{title}, {numrevs} edits (--xmlrevisions)") + # Delay(config=config, session=session) except AttributeError as e: print(e) print("This API library version is not working") @@ -66,13 +59,11 @@ def doXMLRevisionDump( print(e) -def doXMLExportDump( - config: Config, session: requests.Session, xmlfile: TextIOWrapper, lastPage=None -): +def doXMLExportDump(config: Config = None, session=None, xmlfile=None, lastPage=None): print("\nRetrieving the XML for every page\n") lock = True - start: str = "" + start = None if lastPage is not None: try: start = lastPage.find("title").text @@ -86,20 +77,18 @@ def doXMLExportDump( lock = False c = 1 - for title in readTitles(config, session=session, start=start, batch=False): - if title is not str or title == "": + for title in readTitles(config, session=session, start=start): + if not title: continue if title == start: # start downloading from start, included lock = False if lock: continue - Delay(config=config) + Delay(config=config, session=session) if c % 10 == 0: print(f"\n-> Downloaded {c} pages\n") try: - for xml in getXMLPage( - config=config, verbose=True, title=title, session=session - ): + for xml in getXMLPage(config=config, title=title, session=session): xml = cleanXML(xml=xml) xmlfile.write(xml) except PageMissingError: @@ -115,8 +104,7 @@ def doXMLExportDump( c += 1 -# resume=False -def generateXMLDump(config: Config, resume: bool, session: requests.Session): +def generateXMLDump(config: Config = None, resume=False, session=None): """Generates a XML dump for a list of titles or from revision IDs""" header, config = getXMLHeader(config=config, session=session) @@ -126,9 +114,9 @@ def generateXMLDump(config: Config, resume: bool, session: requests.Session): config.date, "current" if config.curonly else "history", ) - xmlfile: TextIOWrapper + xmlfile = None - lastPage: (ElementTree | None) = None + lastPage = None lastPageChunk = None # start != None, means we are resuming a XML dump if resume: @@ -140,9 +128,8 @@ def generateXMLDump(config: Config, resume: bool, session: requests.Session): resume = False lastPage = None else: - try: - lastPage = parseLastPageChunk(lastPageChunk) - except lxml.etree.LxmlError: + lastPage = parseLastPageChunk(lastPageChunk) + if lastPage is None: print("Failed to parse last page chunk: \n%s" % lastPageChunk) print("Cannot resume, exiting now!") sys.exit(1) diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py index e95129a..f360243 100644 --- a/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_header.py @@ -1,8 +1,7 @@ -import contextlib import json import re import sys -from typing import Tuple +from typing import * import requests @@ -12,29 +11,31 @@ from wikiteam3.dumpgenerator.exceptions import ExportAbortedError, PageMissingEr from wikiteam3.dumpgenerator.log import logerror -def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config]: +def getXMLHeader(config: Config = None, session=None) -> Tuple[str, Config]: """Retrieve a random page to extract XML headers (namespace info, etc)""" print(config.api) xml = "" disableSpecialExport = config.xmlrevisions or config.xmlapiexport randomtitle = "Main_Page" if disableSpecialExport and config.api and config.api.endswith("api.php"): - with contextlib.suppress(requests.exceptions.RetryError): + try: print("Getting the XML header from the API") # Export 
and exportnowrap exist from MediaWiki 1.15, allpages from 1.8 r = session.get( f"{config.api}?action=query&export=1&exportnowrap=1&list=allpages&aplimit=1", timeout=10, ) - xml = r.text + xml: str = r.text # Otherwise try without exportnowrap, e.g. Wikia returns a blank page on 1.19 if not re.match(r"\s* Tuple[str, Config f"{config.api}?action=query&export=1&format=json&titles={randomtitle}", timeout=10, ) - with contextlib.suppress(KeyError): + try: xml = r.json()["query"]["export"]["*"] + except KeyError: + pass + except requests.exceptions.RetryError: + pass else: try: @@ -67,36 +72,36 @@ def getXMLHeader(config: Config, session: requests.Session) -> Tuple[str, Config # The does not exist. Not a problem, if we get the . xml = pme.xml except ExportAbortedError: - with contextlib.suppress(ExportAbortedError): - try: - if config.api: - print("Trying the local name for the Special namespace instead") - r = session.get( - url=config.api, - params={ - "action": "query", - "meta": "siteinfo", - "siprop": "namespaces", - "format": "json", - }, - timeout=120, - ) - config.export = ( - json.loads(r.text)["query"]["namespaces"]["-1"]["*"] - + ":Export" - ) - xml = "".join( - list( - getXMLPage( - config=config, - title=randomtitle, - verbose=False, - session=session, - ) + try: + if config.api: + print("Trying the local name for the Special namespace instead") + r = session.get( + url=config.api, + params={ + "action": "query", + "meta": "siteinfo", + "siprop": "namespaces", + "format": "json", + }, + timeout=120, + ) + config.export = ( + json.loads(r.text)["query"]["namespaces"]["-1"]["*"] + ":Export" + ) + xml = "".join( + list( + getXMLPage( + config=config, + title=randomtitle, + verbose=False, + session=session, ) ) - except PageMissingError as pme: - xml = pme.xml + ) + except PageMissingError as pme: + xml = pme.xml + except ExportAbortedError: + pass header = xml.split("")[0] if not re.match(r"\s* Tuple[str, Config print(xml) print("XML export on this wiki is broken, quitting.") logerror( - config=config, - to_stdout=True, - text="XML export on this wiki is broken, quitting.", + to_stdout=True, text="XML export on this wiki is broken, quitting." 
) sys.exit() return header, config diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py index 819ff29..5f17d15 100644 --- a/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_integrity.py @@ -1,10 +1,10 @@ -from typing import Iterable +from typing import * from wikiteam3.dumpgenerator.config import Config def checkXMLIntegrity( - config: Config, titles: (Iterable[str] | None) = None, session=None + config: Config = None, titles: Iterable[str] = None, session=None ): """Check XML dump integrity, to detect broken XML chunks""" # TODO: Fix XML Integrity Check diff --git a/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py b/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py index fe73be2..3cfb552 100644 --- a/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py +++ b/wikiteam3/dumpgenerator/dump/xmldump/xml_truncate.py @@ -1,9 +1,9 @@ import os from io import StringIO +from typing import * import lxml.etree from file_read_backwards import FileReadBackwards -from lxml.etree import _ElementTree as ElementTree def endsWithNewlines(filename: str) -> int: @@ -60,9 +60,10 @@ def truncateXMLDump(filename: str) -> str: return incomplete_segment -def parseLastPageChunk(chunk) -> ElementTree: - parser = lxml.etree.XMLParser(recover=True) - tree = lxml.etree.parse(StringIO(chunk), parser) - return tree.getroot() - # except lxml.etree.LxmlError: - # return None +def parseLastPageChunk(chunk) -> Optional[lxml.etree._ElementTree]: + try: + parser = lxml.etree.XMLParser(recover=True) + tree = lxml.etree.parse(StringIO(chunk), parser) + return tree.getroot() + except lxml.etree.LxmlError: + return None diff --git a/wikiteam3/dumpgenerator/log/log_error.py b/wikiteam3/dumpgenerator/log/log_error.py index 5902ac9..7f18fbf 100644 --- a/wikiteam3/dumpgenerator/log/log_error.py +++ b/wikiteam3/dumpgenerator/log/log_error.py @@ -3,7 +3,7 @@ import datetime from wikiteam3.dumpgenerator.config import Config -def logerror(config: Config, to_stdout=False, text="") -> None: +def logerror(config: Config = None, to_stdout=False, text="") -> None: """Log error in errors.log""" if text: with open(f"{config.path}/errors.log", "a", encoding="utf-8") as outfile: diff --git a/wikiteam3/dumpgenerator/test/test_config.py b/wikiteam3/dumpgenerator/test/test_config.py index ce6521a..da9869e 100644 --- a/wikiteam3/dumpgenerator/test/test_config.py +++ b/wikiteam3/dumpgenerator/test/test_config.py @@ -25,7 +25,7 @@ def _new_config_from_parameter(params): def get_config(mediawiki_ver, api=True): - assert api == True # type: ignore + assert api == True if mediawiki_ver == "1.16.5": return _new_config_from_parameter( [ @@ -33,4 +33,3 @@ def get_config(mediawiki_ver, api=True): "http://group0.mediawiki.demo.save-web.org/mediawiki-1.16.5/api.php", ] ) - raise ValueError(f"Expected mediawiki_ver '1.16.5'; got {mediawiki_ver}") diff --git a/wikiteam3/gui.py b/wikiteam3/gui.py index a3cfb3d..e4f2cfe 100644 --- a/wikiteam3/gui.py +++ b/wikiteam3/gui.py @@ -22,7 +22,7 @@ TODO: * advanced: batch downloads, upload to Internet Archive or anywhere """ -import contextlib + import os import platform import random @@ -129,7 +129,7 @@ class App: self.button11 = Button( self.labelframe11, text="Check", - command=lambda: threading.start_new_threading(self.checkURL, ()), # type: ignore + command=lambda: threading.start_new_threading(self.checkURL, ()), width=5, ) self.button11.grid(row=0, column=3) @@ -275,14 +275,14 @@ class App: 
         self.button21 = Button(
             self.frame2,
             text="Load available dumps",
-            command=lambda: threading.start_new_threading(self.loadAvailableDumps, ()),  # type: ignore
+            command=lambda: threading.start_new_threading(self.loadAvailableDumps, ()),
             width=15,
         )
         self.button21.grid(row=3, column=0)
         self.button23 = Button(
             self.frame2,
             text="Download selection",
-            command=lambda: threading.start_new_threading(self.downloadDump, ()),  # type: ignore
+            command=lambda: threading.start_new_threading(self.downloadDump, ()),
             width=15,
         )
         self.button23.grid(row=3, column=4)
@@ -337,7 +337,7 @@ class App:
         ):  # well-constructed URL?, one dot at least, aaaaa.com, but bb.aaaaa.com is allowed too
             if self.optionmenu11var.get() == "api.php":
                 self.msg("Please wait... Checking api.php...")
-                if checkAPI(self.entry11.get(), None):  # type: ignore
+                if checkAPI(self.entry11.get()):
                     self.entry11.config(background="lightgreen")
                     self.msg("api.php is correct!", level="ok")
                 else:
@@ -345,7 +345,7 @@ class App:
                     self.msg("api.php is incorrect!", level="error")
             elif self.optionmenu11var.get() == "index.php":
                 self.msg("Please wait... Checking index.php...")
-                if checkIndex(self.entry11.get(), None):  # type: ignore
+                if checkIndex(self.entry11.get()):
                     self.entry11.config(background="lightgreen")
                     self.msg("index.php is OK!", level="ok")
                 else:
@@ -374,7 +374,7 @@ class App:
         def run(self):
             for _ in range(10):
                 time.sleep(0.1)
-                self.value += 10  # type: ignore
+                self.value += 10
 
         """
         #get parameters selected
@@ -388,7 +388,7 @@ class App:
 
     def msg(self, msg="", level=""):
         levels = {"ok": "lightgreen", "warning": "yellow", "error": "red"}
-        if level.lower() in levels:
+        if levels.has_key(level.lower()):
             print(f"{level.upper()}: {msg}")
             self.status.config(
                 text=f"{level.upper()}: {msg}", background=levels[level.lower()]
@@ -398,9 +398,9 @@ class App:
             self.status.config(text=msg, background="grey")
 
     def treeSortColumn(self, column, reverse=False):
-        line = [(self.tree.set(i, column), i) for i in self.tree.get_children("")]
-        line.sort(reverse=reverse)
-        for index, (val, i) in enumerate(line):
+        l = [(self.tree.set(i, column), i) for i in self.tree.get_children("")]
+        l.sort(reverse=reverse)
+        for index, (val, i) in enumerate(l):
             self.tree.move(i, "", index)
         self.tree.heading(
             column,
@@ -408,7 +408,7 @@ class App:
         )
 
     def downloadProgress(self, block_count, block_size, total_size):
-        with contextlib.suppress(Exception):
+        try:
             total_mb = total_size / 1024 / 1024.0
             downloaded = block_count * (block_size / 1024 / 1024.0)
             percent = downloaded / (total_mb / 100.0)
@@ -419,6 +419,8 @@ class App:
             self.msg(msg, level="ok")
             # sys.stdout.write("%.1f MB of %.1f MB downloaded (%.2f%%)" %(downloaded, total_mb, percent))
             # sys.stdout.flush()
+        except:
+            pass
 
     def downloadDump(self, event=None):
         if self.block:
@@ -450,7 +452,7 @@ class App:
                         self.dumps[int(item)][5],
                     )
                 )
-                urllib.urlretrieve(  # type: ignore
+                f = urllib.urlretrieve(
                     self.dumps[int(item)][5],
                     filepath,
                     reporthook=self.downloadProgress,
@@ -612,11 +614,11 @@ class App:
             ],
         ]
        wikifarms_r = re.compile(f'({"|".join(wikifarms.keys())})')
-        # c = 0
+        c = 0
         for mirror, url, regexp in self.urls:
             print("Loading data from", mirror, url)
             self.msg(msg=f"Please wait... Loading data from {mirror} {url}")
-            f = urllib.request.urlopen(url)  # type: ignore
+            f = urllib.request.urlopen(url)
             m = re.compile(regexp).finditer(f.read())
             for i in m:
                 filename = i.group("filename")
@@ -626,7 +628,9 @@ class App:
                 if re.search(wikifarms_r, filename):
                     wikifarm = re.findall(wikifarms_r, filename)[0]
                     wikifarm = wikifarms[wikifarm]
-                size = i.group("size") or "Unknown"
+                size = i.group("size")
+                if not size:
+                    size = "Unknown"
                 date = "Unknown"
                 if re.search(r"\-(\d{8})[\.-]", filename):
                     date = re.findall(r"\-(\d{4})(\d{2})(\d{2})[\.-]", filename)[0]
diff --git a/wikiteam3/uploader.py b/wikiteam3/uploader.py
index cee7ae5..57ab0fe 100644
--- a/wikiteam3/uploader.py
+++ b/wikiteam3/uploader.py
@@ -15,9 +15,12 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 import argparse
+import getopt
 import hashlib
+import os
 import re
 import shutil
+import subprocess
 import time
 import urllib.parse
 from io import BytesIO
@@ -92,7 +95,6 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             prefix = domain2prefix(Config(api=wiki))
         except KeyError:
             print("ERROR: could not produce the prefix for %s" % wiki)
-            continue
 
         wikiname = prefix.split("-")[0]
         dumps = []
@@ -161,29 +163,29 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             r = requests.get(url=wiki, params=params, headers=headers)
             if r.status_code < 400:
                 xml = r.text
-        except requests.exceptions.ConnectionError:
+        except requests.exceptions.ConnectionError as e:
             pass
 
         sitename = ""
         baseurl = ""
         lang = ""
         try:
-            sitename = re.findall(r"sitename=\"([^\"]+)\"", xml)[0]  # type: ignore
-        except Exception:
+            sitename = re.findall(r"sitename=\"([^\"]+)\"", xml)[0]
+        except:
             pass
         try:
-            baseurl = re.findall(r"base=\"([^\"]+)\"", xml)[0]  # type: ignore
-        except Exception:
+            baseurl = re.findall(r"base=\"([^\"]+)\"", xml)[0]
+        except:
             pass
         try:
-            lang = re.findall(r"lang=\"([^\"]+)\"", xml)[0]  # type: ignore
-        except Exception:
+            lang = re.findall(r"lang=\"([^\"]+)\"", xml)[0]
+        except:
             pass
 
         if not sitename:
             sitename = wikiname
         if not baseurl:
-            baseurl = re.sub(r"(?im)/api\.php", r"", wiki)  # type: ignore
+            baseurl = re.sub(r"(?im)/api\.php", r"", wiki)
         # Convert protocol-relative URLs
         baseurl = re.sub("^//", "https://", baseurl)
         if lang:
@@ -205,7 +207,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             r = requests.get(url=wiki, params=params, headers=headers)
             if r.status_code < 400:
                 xml = r.text
-        except requests.exceptions.ConnectionError:
+        except requests.exceptions.ConnectionError as e:
             pass
 
         rightsinfourl = ""
@@ -213,7 +215,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
         try:
             rightsinfourl = re.findall(r"rightsinfo url=\"([^\"]+)\"", xml)[0]
             rightsinfotext = re.findall(r"text=\"([^\"]+)\"", xml)[0]
-        except Exception:
+        except:
             pass
 
         raw = ""
@@ -221,7 +223,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             r = requests.get(url=baseurl, headers=headers)
             if r.status_code < 400:
                 raw = r.text
-        except requests.exceptions.ConnectionError:
+        except requests.exceptions.ConnectionError as e:
             pass
 
         # or copyright info from #footer in mainpage
@@ -233,13 +235,13 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
                 rightsinfourl = re.findall(
                     r"<link rel=\"copyright\" href=\"([^\"]+)\" />", raw
                 )[0]
-            except Exception:
+            except:
                 pass
             try:
                 rightsinfotext = re.findall(
                    r"<li id=\"copyright\">([^\n\r]*?)</li>", raw
                 )[0]
-            except Exception:
+            except:
                 pass
         if rightsinfotext and not rightsinfourl:
             rightsinfourl = baseurl + "#footer"
@@ -258,7 +260,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
                 if "http" not in logourl:
                     # Probably a relative path, construct the absolute path
                     logourl = urllib.parse.urljoin(wiki, logourl)
-        except Exception:
+        except:
             pass
 
         # retrieve some info from the wiki
@@ -321,7 +323,7 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             try:
                 item.upload(
                     str(dump),
-                    metadata=md,  # type: ignore
+                    metadata=md,
                     access_key=ia_keys["access"],
                     secret_key=ia_keys["secret"],
                     verbose=True,
@@ -339,14 +341,12 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
             # Update metadata
             r = item.modify_metadata(
-                md,  # type: ignore
-                access_key=ia_keys["access"],
-                secret_key=ia_keys["secret"],
+                md, access_key=ia_keys["access"], secret_key=ia_keys["secret"]
             )
-            if r.status_code != 200:  # type: ignore
+            if r.status_code != 200:
                 print("Error when updating metadata")
-                print(r.status_code)  # type: ignore
-                print(r.text)  # type: ignore
+                print(r.status_code)
+                print(r.text)
 
             print(
                 "You can find it in https://archive.org/details/%s"
                 % (identifier)
@@ -358,11 +358,11 @@ def upload(wikis, logfile, config={}, uploadeddumps=[]):
         try:
             log(logfile, wiki, dump, "ok")
             if logourl:
-                logo = BytesIO(requests.get(logourl, timeout=10).content)  # type: ignore
+                logo = BytesIO(requests.get(logourl, timeout=10).content)
                 if ".png" in logourl:
                     logoextension = "png"
-                elif logourl.split("."):  # type: ignore
-                    logoextension = logourl.split(".")[-1]  # type: ignore
+                elif logourl.split("."):
+                    logoextension = logourl.split(".")[-1]
                 else:
                     logoextension = "unknown"
                 logoname = "wiki-" + wikiname + "_logo." + logoextension
@@ -410,11 +410,11 @@ Use --help to print this help."""
     listfile = config.listfile
     try:
         uploadeddumps = [
-            line.split(";")[1]
-            for line in open("uploader-%s.log" % (listfile)).read().strip().splitlines()
-            if len(line.split(";")) > 1
+            l.split(";")[1]
+            for l in open("uploader-%s.log" % (listfile)).read().strip().splitlines()
+            if len(l.split(";")) > 1
         ]
-    except Exception:
+    except:
         pass
 
     if config.logfile is None:
diff --git a/wikiteam3/utils/__init__.py b/wikiteam3/utils/__init__.py
index 518f689..f05f8ca 100644
--- a/wikiteam3/utils/__init__.py
+++ b/wikiteam3/utils/__init__.py
@@ -1,9 +1,7 @@
 from .domain import domain2prefix
-from .login import botLogin, clientLogin, indexLogin, uniLogin
+from .login import botLogin, clientLogin, fetchLoginToken, indexLogin, uniLogin
 from .monkey_patch import mod_requests_text
 from .uprint import uprint
 from .user_agent import getUserAgent
 from .util import cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities
 from .wiki_avoid import avoidWikimediaProjects
-
-__all__ = [domain2prefix, botLogin, clientLogin, indexLogin, uniLogin, mod_requests_text, uprint, getUserAgent, cleanHTML, cleanXML, removeIP, sha1File, undoHTMLEntities, avoidWikimediaProjects]  # type: ignore
diff --git a/wikiteam3/utils/domain.py b/wikiteam3/utils/domain.py
index 8a230d8..aad0d05 100644
--- a/wikiteam3/utils/domain.py
+++ b/wikiteam3/utils/domain.py
@@ -3,7 +3,7 @@ import re
 from wikiteam3.dumpgenerator.config import Config
 
 
-def domain2prefix(config: Config):
+def domain2prefix(config: Config = None, session=None):
     """Convert domain name to a valid prefix filename."""
 
     # At this point, both api and index are supposed to be defined
diff --git a/wikiteam3/utils/login/__init__.py b/wikiteam3/utils/login/__init__.py
index 0473413..f16f2bf 100644
--- a/wikiteam3/utils/login/__init__.py
+++ b/wikiteam3/utils/login/__init__.py
@@ -4,7 +4,7 @@ import time
 
 import requests
 
-from wikiteam3.utils.login.api import botLogin, clientLogin
+from wikiteam3.utils.login.api import botLogin, clientLogin, fetchLoginToken
 from wikiteam3.utils.login.index import indexLogin
 
 
diff --git a/wikiteam3/utils/login/api.py b/wikiteam3/utils/login/api.py
index d87da04..e1b1f4c 100644
--- a/wikiteam3/utils/login/api.py
+++ b/wikiteam3/utils/login/api.py
@@ -1,6 +1,6 @@
 """ Available since MediaWiki 1.27.
 login to a wiki using username and password (API) """
-from typing import Optional
+from typing import *
 
 import requests
 
@@ -15,7 +15,8 @@ def fetchLoginToken(session: requests.Session, api: str) -> Optional[str]:
     data = response.json()
     try:
         token = data["query"]["tokens"]["logintoken"]
-        return token if type(token) is str else None
+        if type(token) is str:
+            return token
     except KeyError:
         print("fetch login token: Oops! Something went wrong -- ", data)
     return None
diff --git a/wikiteam3/utils/login/index.py b/wikiteam3/utils/login/index.py
index 202fe73..94d332f 100644
--- a/wikiteam3/utils/login/index.py
+++ b/wikiteam3/utils/login/index.py
@@ -1,7 +1,7 @@
 """ Always available login methods.(mw 1.16-1.39)
 Even oler versions of MW may work, but not tested.
 """
-from typing import Optional
+from typing import *
 
 import lxml.html
 import requests
@@ -45,7 +45,7 @@ def indexLogin(
         "title": "Special:UserLogin",  # introduced before MW 1.39.
         "force": "",  # introduced before MW 1.39, empty string is OK.
     }
-    r = session.post(index, allow_redirects=False, params=params, data=data)  # type: ignore
+    r = session.post(index, allow_redirects=False, params=params, data=data)
     if r.status_code == 302:
         print("index login: Success! Welcome, ", username, "!")
         return session
diff --git a/wikiteam3/utils/monkey_patch.py b/wikiteam3/utils/monkey_patch.py
index 2ad9323..6abda31 100644
--- a/wikiteam3/utils/monkey_patch.py
+++ b/wikiteam3/utils/monkey_patch.py
@@ -3,13 +3,13 @@ import requests
 from wikiteam3.dumpgenerator.cli.delay import Delay
 
 
-def mod_requests_text(requests: requests):  # type: ignore
+def mod_requests_text(requests: requests):
     """Monkey patch `requests.Response.text` to remove BOM"""
 
     def new_text(self):
         return self.content.lstrip(b"\xef\xbb\xbf").decode(self.encoding)
 
-    requests.Response.text = property(new_text)  # type: ignore
+    requests.Response.text = property(new_text)
 
 
 class DelaySession:
@@ -26,8 +26,8 @@ class DelaySession:
         """Don't forget to call `release()`"""
 
         def new_send(request, **kwargs):
-            Delay(msg=self.msg, delay=self.delay, config=self.config)  # type: ignore
-            return self.old_send(request, **kwargs)  # type: ignore
+            Delay(msg=self.msg, delay=self.delay, config=self.config)
+            return self.old_send(request, **kwargs)
 
         self.old_send = self.session.send
         self.session.send = new_send
diff --git a/wikiteam3/utils/user_agent.py b/wikiteam3/utils/user_agent.py
index eef019e..dd1df20 100644
--- a/wikiteam3/utils/user_agent.py
+++ b/wikiteam3/utils/user_agent.py
@@ -319,10 +319,10 @@ def getUserAgent():
 
 
 def setupUserAgent(session: requests.Session):
-    session._orirequest = session.request  # type: ignore
+    session._orirequest = session.request
 
     def newrequest(*args, **kwargs):
         session.headers.update({"User-Agent": getUserAgent()})
-        return session._orirequest(*args, **kwargs)  # type: ignore
+        return session._orirequest(*args, **kwargs)
 
-    session.request = newrequest  # type: ignore
+    session.request = newrequest
diff --git a/wikiteam3/utils/wiki_avoid.py b/wikiteam3/utils/wiki_avoid.py
index aed5641..c7593fd 100644
--- a/wikiteam3/utils/wiki_avoid.py
+++ b/wikiteam3/utils/wiki_avoid.py
@@ -1,11 +1,11 @@
 import re
 import sys
-from typing import Dict
+from typing import *
 
 from wikiteam3.dumpgenerator.config import Config
 
 
-def avoidWikimediaProjects(config: Config, other: Dict):
+def avoidWikimediaProjects(config: Config = None, other: Dict = None):
     """Skip Wikimedia projects and redirect to the dumps website"""
 
     # notice about wikipedia dumps