diff --git a/.github/actions/poetry_setup/action.yml b/.github/actions/poetry_setup/action.yml new file mode 100644 index 00000000..3e2ef52f --- /dev/null +++ b/.github/actions/poetry_setup/action.yml @@ -0,0 +1,64 @@ +# An action for setting up poetry install with caching. +# Using a custom action since the default action does not +# take poetry install groups into account. +# Action code from: +# https://github.com/actions/setup-python/issues/505#issuecomment-1273013236 +name: poetry-install-with-caching +description: Poetry install with support for caching of dependency groups. + +inputs: + python-version: + description: Python version, supporting MAJOR.MINOR only + required: true + + poetry-version: + description: Poetry version + required: true + + install-command: + description: Command run for installing dependencies + required: false + default: poetry install + + cache-key: + description: Cache key to use for manual handling of caching + required: true + + working-directory: + description: Directory to run install-command in + required: false + default: "" + +runs: + using: composite + steps: + - uses: actions/setup-python@v4 + with: + python-version: ${{ inputs.python-version }} + + - uses: actions/cache@v3 + id: cache-pip + env: + SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15" + with: + path: | + ~/.cache/pip + key: pip-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }} + + - run: pipx install poetry==${{ inputs.poetry-version }} --python python${{ inputs.python-version }} + shell: bash + + - uses: actions/cache@v3 + id: cache-poetry + env: + SEGMENT_DOWNLOAD_TIMEOUT_MIN: "15" + with: + path: | + ~/.cache/pypoetry/virtualenvs + ~/.cache/pypoetry/cache + ~/.cache/pypoetry/artifacts + key: poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-poetry-${{ inputs.poetry-version }}-${{ inputs.cache-key }}-${{ hashFiles('poetry.lock') }} + + - run: ${{ inputs.install-command }} + working-directory: ${{ inputs.working-directory }} + shell: bash diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ed46af8a..ec960bb4 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -20,15 +20,13 @@ jobs: - "3.11" steps: - uses: actions/checkout@v3 - - name: Install poetry - run: pipx install poetry==$POETRY_VERSION - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: "./.github/actions/poetry_setup" with: python-version: ${{ matrix.python-version }} - cache: "poetry" - - name: Install dependencies - run: poetry install + poetry-version: "1.4.2" + cache-key: "main" + install-command: "poetry install" - name: Run unit tests run: | make test diff --git a/.github/workflows/test_all.yml b/.github/workflows/test_all.yml new file mode 100644 index 00000000..d914a16f --- /dev/null +++ b/.github/workflows/test_all.yml @@ -0,0 +1,33 @@ +# Run unit tests with all optional packages installed. +name: test_all + +on: + push: + branches: [master] + pull_request: + +env: + POETRY_VERSION: "1.4.2" + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + steps: + - uses: actions/checkout@v3 + - name: Set up Python ${{ matrix.python-version }} + uses: "./.github/actions/poetry_setup" + with: + python-version: ${{ matrix.python-version }} + poetry-version: "1.4.2" + cache-key: "extended" + install-command: "poetry install -E extended_testing" + - name: Run unit tests + run: | + make test diff --git a/poetry.lock b/poetry.lock index 0c0b0b72..d29d7498 100644 --- a/poetry.lock +++ b/poetry.lock @@ -5362,6 +5362,27 @@ gcs = ["google-cloud-storage (>=1.26.0,<2.0.0)"] s3 = ["boto3"] test = ["mock", "pytest", "pytest-coverage", "typer-cli"] +[[package]] +name = "pdfminer-six" +version = "20221105" +description = "PDF parser and analyzer" +category = "main" +optional = true +python-versions = ">=3.6" +files = [ + {file = "pdfminer.six-20221105-py3-none-any.whl", hash = "sha256:1eaddd712d5b2732f8ac8486824533514f8ba12a0787b3d5fe1e686cd826532d"}, + {file = "pdfminer.six-20221105.tar.gz", hash = "sha256:8448ab7b939d18b64820478ecac5394f482d7a79f5f7eaa7703c6c959c175e1d"}, +] + +[package.dependencies] +charset-normalizer = ">=2.0.0" +cryptography = ">=36.0.0" + +[package.extras] +dev = ["black", "mypy (==0.931)", "nox", "pytest"] +docs = ["sphinx", "sphinx-argparse"] +image = ["Pillow"] + [[package]] name = "pexpect" version = "4.8.0" @@ -6223,6 +6244,101 @@ pyarrow = ">=10" [package.extras] tests = ["duckdb", "polars[pandas,pyarrow]", "pytest"] +[[package]] +name = "pymongo" +version = "4.3.3" +description = "Python driver for MongoDB " +category = "dev" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pymongo-4.3.3-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:74731c9e423c93cbe791f60c27030b6af6a948cef67deca079da6cd1bb583a8e"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux1_i686.whl", hash = "sha256:66413c50d510e5bcb0afc79880d1693a2185bcea003600ed898ada31338c004e"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:9b87b23570565a6ddaa9244d87811c2ee9cffb02a753c8a2da9c077283d85845"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_i686.whl", hash = "sha256:695939036a320f4329ccf1627edefbbb67cc7892b8222d297b0dd2313742bfee"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_ppc64le.whl", hash = "sha256:ffcc8394123ea8d43fff8e5d000095fe7741ce3f8988366c5c919c4f5eb179d3"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_s390x.whl", hash = "sha256:943f208840777f34312c103a2d1caab02d780c4e9be26b3714acf6c4715ba7e1"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux2014_x86_64.whl", hash = "sha256:01f7cbe88d22440b6594c955e37312d932fd632ffed1a86d0c361503ca82cc9d"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cdb87309de97c63cb9a69132e1cb16be470e58cffdfbad68fdd1dc292b22a840"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d86c35d94b5499689354ccbc48438a79f449481ee6300f3e905748edceed78e7"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a966d5304b7d90c45c404914e06bbf02c5bf7e99685c6c12f0047ef2aa837142"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:be1d2ce7e269215c3ee9a215e296b7a744aff4f39233486d2c4d77f5f0c561a6"}, + {file = "pymongo-4.3.3-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:55b6163dac53ef1e5d834297810c178050bd0548a4136cd4e0f56402185916ca"}, + {file = "pymongo-4.3.3-cp310-cp310-win32.whl", hash = "sha256:dc0cff74cd36d7e1edba91baa09622c35a8a57025f2f2b7a41e3f83b1db73186"}, + {file = "pymongo-4.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:cafa52873ae12baa512a8721afc20de67a36886baae6a5f394ddef0ce9391f91"}, + {file = "pymongo-4.3.3-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:599d3f6fbef31933b96e2d906b0f169b3371ff79ea6aaf6ecd76c947a3508a3d"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c0640b4e9d008e13956b004d1971a23377b3d45491f87082161c92efb1e6c0d6"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:341221e2f2866a5960e6f8610f4cbac0bb13097f3b1a289aa55aba984fc0d969"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e7fac06a539daef4fcf5d8288d0d21b412f9b750454cd5a3cf90484665db442a"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d3a51901066696c4af38c6c63a1f0aeffd5e282367ff475de8c191ec9609b56d"}, + {file = "pymongo-4.3.3-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f3055510fdfdb1775bc8baa359783022f70bb553f2d46e153c094dfcb08578ff"}, + {file = "pymongo-4.3.3-cp311-cp311-win32.whl", hash = "sha256:524d78673518dcd352a91541ecd2839c65af92dc883321c2109ef6e5cd22ef23"}, + {file = "pymongo-4.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:b8a03af1ce79b902a43f5f694c4ca8d92c2a4195db0966f08f266549e2fc49bc"}, + {file = "pymongo-4.3.3-cp37-cp37m-macosx_10_6_intel.whl", hash = "sha256:39b03045c71f761aee96a12ebfbc2f4be89e724ff6f5e31c2574c1a0e2add8bd"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_i686.whl", hash = "sha256:6fcfbf435eebf8a1765c6d1f46821740ebe9f54f815a05c8fc30d789ef43cb12"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:7d43ac9c7eeda5100fb0a7152fab7099c9cf9e5abd3bb36928eb98c7d7a339c6"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:3b93043b14ba7eb08c57afca19751658ece1cfa2f0b7b1fb5c7a41452fbb8482"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_i686.whl", hash = "sha256:c09956606c08c4a7c6178a04ba2dd9388fcc5db32002ade9c9bc865ab156ab6d"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_ppc64le.whl", hash = "sha256:b0cfe925610f2fd59555bb7fc37bd739e4b197d33f2a8b2fae7b9c0c6640318c"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:4d00b91c77ceb064c9b0459f0d6ea5bfdbc53ea9e17cf75731e151ef25a830c7"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:c6258a3663780ae47ba73d43eb63c79c40ffddfb764e09b56df33be2f9479837"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c29e758f0e734e1e90357ae01ec9c6daf19ff60a051192fe110d8fb25c62600e"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:12f3621a46cdc7a9ba8080422262398a91762a581d27e0647746588d3f995c88"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:47f7aa217b25833cd6f0e72b0d224be55393c2692b4f5e0561cb3beeb10296e9"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2fdc855149efe7cdcc2a01ca02bfa24761c640203ea94df467f3baf19078be"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5effd87c7d363890259eac16c56a4e8da307286012c076223997f8cc4a8c435b"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:6dd1cf2995fdbd64fc0802313e8323f5fa18994d51af059b5b8862b73b5e53f0"}, + {file = "pymongo-4.3.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:bb869707d8e30645ed6766e44098600ca6cdf7989c22a3ea2b7966bb1d98d4b2"}, + {file = "pymongo-4.3.3-cp37-cp37m-win32.whl", hash = "sha256:49210feb0be8051a64d71691f0acbfbedc33e149f0a5d6e271fddf6a12493fed"}, + {file = "pymongo-4.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:54c377893f2cbbffe39abcff5ff2e917b082c364521fa079305f6f064e1a24a9"}, + {file = "pymongo-4.3.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:c184ec5be465c0319440734491e1aa4709b5f3ba75fdfc9dbbc2ae715a7f6829"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux1_i686.whl", hash = "sha256:dca34367a4e77fcab0693e603a959878eaf2351585e7d752cac544bc6b2dee46"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:cd6a4afb20fb3c26a7bfd4611a0bbb24d93cbd746f5eb881f114b5e38fd55501"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0c466710871d0026c190fc4141e810cf9d9affbf4935e1d273fbdc7d7cda6143"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_i686.whl", hash = "sha256:d07d06dba5b5f7d80f9cc45501456e440f759fe79f9895922ed486237ac378a8"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_ppc64le.whl", hash = "sha256:711bc52cb98e7892c03e9b669bebd89c0a890a90dbc6d5bb2c47f30239bac6e9"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:34b040e095e1671df0c095ec0b04fc4ebb19c4c160f87c2b55c079b16b1a6b00"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux2014_x86_64.whl", hash = "sha256:4ed00f96e147f40b565fe7530d1da0b0f3ab803d5dd5b683834500fa5d195ec4"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ef888f48eb9203ee1e04b9fb27429017b290fb916f1e7826c2f7808c88798394"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:316498b642c00401370b2156b5233b256f9b33799e0a8d9d0b8a7da217a20fca"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:fa7e202feb683dad74f00dea066690448d0cfa310f8a277db06ec8eb466601b5"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:52896e22115c97f1c829db32aa2760b0d61839cfe08b168c2b1d82f31dbc5f55"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c051fe37c96b9878f37fa58906cb53ecd13dcb7341d3a85f1e2e2f6b10782d9"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:5134d33286c045393c7beb51be29754647cec5ebc051cf82799c5ce9820a2ca2"}, + {file = "pymongo-4.3.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:a9c2885b4a8e6e39db5662d8b02ca6dcec796a45e48c2de12552841f061692ba"}, + {file = "pymongo-4.3.3-cp38-cp38-win32.whl", hash = "sha256:a6cd6f1db75eb07332bd3710f58f5fce4967eadbf751bad653842750a61bda62"}, + {file = "pymongo-4.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:d5571b6978750601f783cea07fb6b666837010ca57e5cefa389c1d456f6222e2"}, + {file = "pymongo-4.3.3-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:81d1a7303bd02ca1c5be4aacd4db73593f573ba8e0c543c04c6da6275fd7a47e"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux1_i686.whl", hash = "sha256:016c412118e1c23fef3a1eada4f83ae6e8844fd91986b2e066fc1b0013cdd9ae"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:8fd6e191b92a10310f5a6cfe10d6f839d79d192fb02480bda325286bd1c7b385"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:e2961b05f9c04a53da8bfc72f1910b6aec7205fcf3ac9c036d24619979bbee4b"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_i686.whl", hash = "sha256:b38a96b3eed8edc515b38257f03216f382c4389d022a8834667e2bc63c0c0c31"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_ppc64le.whl", hash = "sha256:c1a70c51da9fa95bd75c167edb2eb3f3c4d27bc4ddd29e588f21649d014ec0b7"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:8a06a0c02f5606330e8f2e2f3b7949877ca7e4024fa2bff5a4506bec66c49ec7"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux2014_x86_64.whl", hash = "sha256:6c2216d8b6a6d019c6f4b1ad55f890e5e77eb089309ffc05b6911c09349e7474"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eac0a143ef4f28f49670bf89cb15847eb80b375d55eba401ca2f777cd425f338"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:08fc250b5552ee97ceeae0f52d8b04f360291285fc7437f13daa516ce38fdbc6"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704d939656e21b073bfcddd7228b29e0e8a93dd27b54240eaafc0b9a631629a6"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1074f1a6f23e28b983c96142f2d45be03ec55d93035b471c26889a7ad2365db3"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b16250238de8dafca225647608dddc7bbb5dce3dd53b4d8e63c1cc287394c2f"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:7761cacb8745093062695b11574effea69db636c2fd0a9269a1f0183712927b4"}, + {file = "pymongo-4.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.whl", hash = "sha256:fd7bb378d82b88387dc10227cfd964f6273eb083e05299e9b97cbe075da12d11"}, + {file = "pymongo-4.3.3-cp39-cp39-win32.whl", hash = "sha256:dc24d245026a72d9b4953729d31813edd4bd4e5c13622d96e27c284942d33f24"}, + {file = "pymongo-4.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:fc28e8d85d392a06434e9a934908d97e2cf453d69488d2bcd0bfb881497fd975"}, + {file = "pymongo-4.3.3.tar.gz", hash = "sha256:34e95ffb0a68bffbc3b437f2d1f25fc916fef3df5cdeed0992da5f42fae9b807"}, +] + +[package.dependencies] +dnspython = ">=1.16.0,<3.0.0" + +[package.extras] +aws = ["pymongo-auth-aws (<2.0.0)"] +encryption = ["pymongo-auth-aws (<2.0.0)", "pymongocrypt (>=1.3.0,<2.0.0)"] +gssapi = ["pykerberos"] +ocsp = ["certifi", "pyopenssl (>=17.2.0)", "requests (<3.0.0)", "service-identity (>=18.1.0)"] +snappy = ["python-snappy"] +zstd = ["zstandard"] + [[package]] name = "pyowm" version = "3.3.0" @@ -9686,6 +9802,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] +extended-testing = ["pdfminer-six", "pypdf"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] @@ -9693,4 +9810,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "2352db14ae75227c4d1ab34d48c74da3a16ceaeb5c5fa5df1a1dfcc5ae8e69e6" +content-hash = "63324f95fc04b48b631353c5e1a1a2ecc1b9dc757441181d4920e9d77cd8951f" diff --git a/pyproject.toml b/pyproject.toml index a4dc14ac..18e4ef7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ pexpect = {version = "^4.8.0", optional = true} pyvespa = {version = "^0.33.0", optional = true} O365 = {version = "^2.0.26", optional = true} jq = {version = "^1.4.1", optional = true} +pdfminer-six = {version = "^20221105", optional = true} [tool.poetry.group.docs.dependencies] @@ -161,6 +162,8 @@ cohere = ["cohere"] embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq"] +# An extra used to be able to add extended testing. +extended_testing = ["pypdf", "pdfminer.six"] [tool.ruff] select = [ diff --git a/tests/data.py b/tests/data.py new file mode 100644 index 00000000..228a9b21 --- /dev/null +++ b/tests/data.py @@ -0,0 +1,10 @@ +"""Module defines common test data.""" +from pathlib import Path + +_THIS_DIR = Path(__file__).parent + +_EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples" + +# Paths to test PDF files +HELLO_PDF = _EXAMPLES_DIR / "hello.pdf" +LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf" diff --git a/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py b/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py new file mode 100644 index 00000000..4063cece --- /dev/null +++ b/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py @@ -0,0 +1,79 @@ +"""Tests for the various PDF parsers.""" +from typing import Iterator + +import pytest + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob +from langchain.document_loaders.parsers.pdf import ( + PDFMinerParser, + PyMuPDFParser, + PyPDFium2Parser, + PyPDFParser, +) +from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF + + +def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None: + """Standard tests to verify that the given parser works. + + Args: + parser (BaseBlobParser): The parser to test. + splits_by_page (bool): Whether the parser splits by page or not by default. + """ + blob = Blob.from_path(HELLO_PDF) + doc_generator = parser.lazy_parse(blob) + assert isinstance(doc_generator, Iterator) + docs = list(doc_generator) + assert len(docs) == 1 + page_content = docs[0].page_content + assert isinstance(page_content, str) + # The different parsers return different amount of whitespace, so using + # startswith instead of equals. + assert docs[0].page_content.startswith("Hello world!") + + blob = Blob.from_path(LAYOUT_PARSER_PAPER_PDF) + doc_generator = parser.lazy_parse(blob) + assert isinstance(doc_generator, Iterator) + docs = list(doc_generator) + + if splits_by_page: + assert len(docs) == 16 + else: + assert len(docs) == 1 + # Test is imprecise since the parsers yield different parse information depending + # on configuration. Each parser seems to yield a slightly different result + # for this page! + assert "LayoutParser" in docs[0].page_content + metadata = docs[0].metadata + + assert metadata["source"] == str(LAYOUT_PARSER_PAPER_PDF) + + if splits_by_page: + assert metadata["page"] == 0 + + +@pytest.mark.requires("fitz") +def test_pymupdf_loader() -> None: + """Test PyMuPDF loader.""" + _assert_with_parser(PyMuPDFParser()) + + +@pytest.mark.requires("pypdf") +def test_pypdf_parser() -> None: + """Test PyPDF parser.""" + _assert_with_parser(PyPDFParser()) + + +@pytest.mark.requires("pdfminer") +def test_pdfminer_parser() -> None: + """Test PDFMiner parser.""" + # Does not follow defaults to split by page. + _assert_with_parser(PDFMinerParser(), splits_by_page=False) + + +@pytest.mark.requires("pypdfium2") +def test_pypdfium2_parser() -> None: + """Test PyPDFium2 parser.""" + # Does not follow defaults to split by page. + _assert_with_parser(PyPDFium2Parser()) diff --git a/tests/unit_tests/document_loader/parsers/test_public_api.py b/tests/unit_tests/document_loader/parsers/test_public_api.py new file mode 100644 index 00000000..52ce7e8e --- /dev/null +++ b/tests/unit_tests/document_loader/parsers/test_public_api.py @@ -0,0 +1,11 @@ +from langchain.document_loaders.parsers import __all__ + + +def test_parsers_public_api_correct() -> None: + """Test public API of parsers for breaking changes.""" + assert set(__all__) == { + "PyPDFParser", + "PDFMinerParser", + "PyMuPDFParser", + "PyPDFium2Parser", + }