diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 44934058..d4049452 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -40,5 +40,9 @@ jobs: fi - name: Run ${{matrix.test_type}} tests run: | - make test + if [ "${{ matrix.test_type }}" == "core" ]; then + make test + else + make extended_tests + fi shell: bash diff --git a/Makefile b/Makefile index 2ed86392..98efefe8 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: all clean format lint test tests test_watch integration_tests docker_tests help +.PHONY: all clean format lint test tests test_watch integration_tests docker_tests help extended_tests all: help @@ -40,6 +40,9 @@ test: tests: poetry run pytest $(TEST_FILE) +extended_tests: + poetry run pytest --only-extended tests/unit_tests + test_watch: poetry run ptw --now . -- tests/unit_tests @@ -59,7 +62,9 @@ help: @echo 'format - run code formatters' @echo 'lint - run linters' @echo 'test - run unit tests' + @echo 'test - run unit tests' @echo 'test TEST_FILE= - run all tests in file' + @echo 'extended_tests - run only extended unit tests' @echo 'test_watch - run unit tests in watch mode' @echo 'integration_tests - run integration tests' @echo 'docker_tests - run unit tests in docker' diff --git a/poetry.lock b/poetry.lock index 1a571324..58342d0c 100644 --- a/poetry.lock +++ b/poetry.lock @@ -419,7 +419,7 @@ name = "arxiv" version = "1.4.7" description = "Python wrapper for the arXiv API: http://arxiv.org/help/api/" category = "main" -optional = true +optional = false python-versions = ">=3.7" files = [ {file = "arxiv-1.4.7-py3-none-any.whl", hash = "sha256:22b8f610957bb6859a25fac9dc205ab6ba76d521791119a5762ea52625e398a0"}, @@ -1896,7 +1896,7 @@ name = "feedparser" version = "6.0.10" description = "Universal feed parser, handles RSS 0.9x, RSS 1.0, RSS 2.0, CDF, Atom 0.3, and Atom 1.0 feeds" category = "main" -optional = true +optional = false python-versions = ">=3.6" files = [ {file = "feedparser-6.0.10-py3-none-any.whl", hash = "sha256:79c257d526d13b944e965f6095700587f27388e50ea16fd245babe4dfae7024f"}, @@ -7656,7 +7656,7 @@ name = "sgmllib3k" version = "1.0.0" description = "Py3k port of sgmllib." category = "main" -optional = true +optional = false python-versions = "*" files = [ {file = "sgmllib3k-1.0.0.tar.gz", hash = "sha256:7868fb1c8bfa764c1ac563d3cf369c381d1325d36124933a726f29fcdaa812e9"}, @@ -9998,14 +9998,14 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] -extended-testing = ["pdfminer-six", "pypdf"] +extended-testing = ["pdfminer-six", "pypdf", "tqdm"] hnswlib = ["docarray", "hnswlib", "protobuf"] in-memory-store = ["docarray"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] -openai = ["openai"] +openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "62b7e066979d91e6baf921af79ac1fd0f44d9c0809b697dd511ac7c0fb3a09cc" +content-hash = "6d5c4aa06539e6f7c7531c30d73cbf08fbdea75486bf4b81c106b9e678a13b45" diff --git a/pyproject.toml b/pyproject.toml index 1d980c59..0bfc2b0a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -170,7 +170,9 @@ embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "protobuf", "hnswlib"] # An extra used to be able to add extended testing. -extended_testing = ["pypdf", "pdfminer.six"] +extended_testing = [ + "pypdf", "pdfminer.six", "tqdm" +] [tool.ruff] select = [ diff --git a/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py b/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py index 4063cece..7737ab93 100644 --- a/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py +++ b/tests/unit_tests/document_loader/parsers/test_pdf_parsers.py @@ -7,8 +7,6 @@ from langchain.document_loaders.base import BaseBlobParser from langchain.document_loaders.blob_loaders import Blob from langchain.document_loaders.parsers.pdf import ( PDFMinerParser, - PyMuPDFParser, - PyPDFium2Parser, PyPDFParser, ) from tests.data import HELLO_PDF, LAYOUT_PARSER_PAPER_PDF @@ -53,12 +51,6 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> assert metadata["page"] == 0 -@pytest.mark.requires("fitz") -def test_pymupdf_loader() -> None: - """Test PyMuPDF loader.""" - _assert_with_parser(PyMuPDFParser()) - - @pytest.mark.requires("pypdf") def test_pypdf_parser() -> None: """Test PyPDF parser.""" @@ -70,10 +62,3 @@ def test_pdfminer_parser() -> None: """Test PDFMiner parser.""" # Does not follow defaults to split by page. _assert_with_parser(PDFMinerParser(), splits_by_page=False) - - -@pytest.mark.requires("pypdfium2") -def test_pypdfium2_parser() -> None: - """Test PyPDFium2 parser.""" - # Does not follow defaults to split by page. - _assert_with_parser(PyPDFium2Parser())