Add Confluence Loader unit tests (#3333)

Adds some basic unit tests for the ConfluenceLoader that can be extended
later. Ports this [PR from
llama-hub](https://github.com/emptycrown/llama-hub/pull/208) and adapts
it to `langchain`.

@Jflick58 and @zywilliamli adding you here as potential reviewers

---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
This commit is contained in:
Adam Quigley 2023-05-17 08:17:07 +10:00 committed by GitHub
parent d126276693
commit e78c9be312
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 187 additions and 17 deletions

View File

@ -1,5 +1,6 @@
"""Load Data from a Confluence Space""" """Load Data from a Confluence Space"""
import logging import logging
from io import BytesIO
from typing import Any, Callable, List, Optional, Union from typing import Any, Callable, List, Optional, Union
from tenacity import ( from tenacity import (
@ -370,12 +371,10 @@ class ConfluenceLoader(BaseLoader):
def process_attachment(self, page_id: str) -> List[str]: def process_attachment(self, page_id: str) -> List[str]:
try: try:
import requests # noqa: F401
from PIL import Image # noqa: F401 from PIL import Image # noqa: F401
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"`pytesseract` or `pdf2image` or `Pillow` package not found, " "`Pillow` package not found, " "please run `pip install Pillow`"
"please run `pip install pytesseract pdf2image Pillow`"
) )
# depending on setup you may also need to set the correct path for # depending on setup you may also need to set the correct path for
@ -419,9 +418,6 @@ class ConfluenceLoader(BaseLoader):
"please run `pip install pytesseract pdf2image`" "please run `pip install pytesseract pdf2image`"
) )
import pytesseract # noqa: F811
from pdf2image import convert_from_bytes # noqa: F811
response = self.confluence.request(path=link, absolute=True) response = self.confluence.request(path=link, absolute=True)
text = "" text = ""
@ -444,8 +440,6 @@ class ConfluenceLoader(BaseLoader):
def process_image(self, link: str) -> str: def process_image(self, link: str) -> str:
try: try:
from io import BytesIO # noqa: F401
import pytesseract # noqa: F401 import pytesseract # noqa: F401
from PIL import Image # noqa: F401 from PIL import Image # noqa: F401
except ImportError: except ImportError:
@ -472,8 +466,6 @@ class ConfluenceLoader(BaseLoader):
def process_doc(self, link: str) -> str: def process_doc(self, link: str) -> str:
try: try:
from io import BytesIO # noqa: F401
import docx2txt # noqa: F401 import docx2txt # noqa: F401
except ImportError: except ImportError:
raise ImportError( raise ImportError(
@ -522,17 +514,14 @@ class ConfluenceLoader(BaseLoader):
def process_svg(self, link: str) -> str: def process_svg(self, link: str) -> str:
try: try:
from io import BytesIO # noqa: F401
import pytesseract # noqa: F401 import pytesseract # noqa: F401
from PIL import Image # noqa: F401 from PIL import Image # noqa: F401
from reportlab.graphics import renderPM # noqa: F401 from reportlab.graphics import renderPM # noqa: F401
from reportlab.graphics.shapes import Drawing # noqa: F401
from svglib.svglib import svg2rlg # noqa: F401 from svglib.svglib import svg2rlg # noqa: F401
except ImportError: except ImportError:
raise ImportError( raise ImportError(
"`pytesseract`, `Pillow`, or `svglib` package not found, " "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
"please run `pip install pytesseract Pillow svglib`" "please run `pip install pytesseract Pillow reportlab svglib`"
) )
response = self.confluence.request(path=link, absolute=True) response = self.confluence.request(path=link, absolute=True)

4
poetry.lock generated
View File

@ -10291,7 +10291,7 @@ all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
cohere = ["cohere"] cohere = ["cohere"]
embeddings = ["sentence-transformers"] embeddings = ["sentence-transformers"]
extended-testing = ["jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm"] extended-testing = ["atlassian-python-api", "beautifulsoup4", "jq", "lxml", "pandas", "pdfminer-six", "pymupdf", "pypdf", "pypdfium2", "telethon", "tqdm"]
hnswlib = ["docarray", "hnswlib", "protobuf"] hnswlib = ["docarray", "hnswlib", "protobuf"]
in-memory-store = ["docarray"] in-memory-store = ["docarray"]
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
@ -10301,4 +10301,4 @@ qdrant = ["qdrant-client"]
[metadata] [metadata]
lock-version = "2.0" lock-version = "2.0"
python-versions = ">=3.8.1,<4.0" python-versions = ">=3.8.1,<4.0"
content-hash = "b4cc0a605ec9b6ee8752f7d708a5700143815d32f699461ce6470ca44b62701a" content-hash = "18f77265eb5eb254f4fd308bdb4b53b2e2f7175fa2323af5112b9c62b00f4632"

View File

@ -192,6 +192,8 @@ extended_testing = [
"pypdfium2", "pypdfium2",
"tqdm", "tqdm",
"lxml", "lxml",
"atlassian-python-api",
"beautifulsoup4",
"pandas", "pandas",
"telethon", "telethon",
] ]

View File

@ -0,0 +1,179 @@
import unittest
from typing import Dict
from unittest.mock import MagicMock, patch
import pytest
from langchain.docstore.document import Document
from langchain.document_loaders.confluence import ConfluenceLoader
@pytest.fixture
def mock_confluence(): # type: ignore
with patch("atlassian.Confluence") as mock_confluence:
yield mock_confluence
@pytest.mark.requires("atlassian", "bs4", "lxml")
class TestConfluenceLoader:
CONFLUENCE_URL = "https://example.atlassian.com/wiki"
MOCK_USERNAME = "user@gmail.com"
MOCK_API_TOKEN = "api_token"
MOCK_SPACE_KEY = "spaceId123"
def test_confluence_loader_initialization(self, mock_confluence: MagicMock) -> None:
ConfluenceLoader(
url=self.CONFLUENCE_URL,
username=self.MOCK_USERNAME,
api_key=self.MOCK_API_TOKEN,
)
mock_confluence.assert_called_once_with(
url=self.CONFLUENCE_URL,
username="user@gmail.com",
password="api_token",
cloud=True,
)
def test_confluence_loader_initialization_from_env(
self, mock_confluence: MagicMock
) -> None:
with unittest.mock.patch.dict(
"os.environ",
{
"CONFLUENCE_USERNAME": self.MOCK_USERNAME,
"CONFLUENCE_API_TOKEN": self.MOCK_API_TOKEN,
},
):
ConfluenceLoader(url=self.CONFLUENCE_URL)
mock_confluence.assert_called_with(
url=self.CONFLUENCE_URL, username=None, password=None, cloud=True
)
def test_confluence_loader_load_data_invalid_args(self) -> None:
confluence_loader = ConfluenceLoader(
url=self.CONFLUENCE_URL,
username=self.MOCK_USERNAME,
api_key=self.MOCK_API_TOKEN,
)
with pytest.raises(
ValueError,
match="Must specify at least one among `space_key`, `page_ids`, `label`, `cql` parameters.", # noqa: E501
):
confluence_loader.load()
def test_confluence_loader_load_data_by_page_ids(
self, mock_confluence: MagicMock
) -> None:
mock_confluence.get_page_by_id.side_effect = [
self._get_mock_page("123"),
self._get_mock_page("456"),
]
mock_confluence.get_all_restrictions_for_content.side_effect = [
self._get_mock_page_restrictions("123"),
self._get_mock_page_restrictions("456"),
]
confluence_loader = self._get_mock_confluence_loader(mock_confluence)
mock_page_ids = ["123", "456"]
documents = confluence_loader.load(page_ids=mock_page_ids)
assert mock_confluence.get_page_by_id.call_count == 2
assert mock_confluence.get_all_restrictions_for_content.call_count == 2
assert len(documents) == 2
assert all(isinstance(doc, Document) for doc in documents)
assert documents[0].page_content == "Content 123"
assert documents[1].page_content == "Content 456"
assert mock_confluence.get_all_pages_from_space.call_count == 0
assert mock_confluence.get_all_pages_by_label.call_count == 0
assert mock_confluence.cql.call_count == 0
assert mock_confluence.get_page_child_by_type.call_count == 0
def test_confluence_loader_load_data_by_space_id(
self, mock_confluence: MagicMock
) -> None:
# one response with two pages
mock_confluence.get_all_pages_from_space.return_value = [
self._get_mock_page("123"),
self._get_mock_page("456"),
]
mock_confluence.get_all_restrictions_for_content.side_effect = [
self._get_mock_page_restrictions("123"),
self._get_mock_page_restrictions("456"),
]
confluence_loader = self._get_mock_confluence_loader(mock_confluence)
documents = confluence_loader.load(space_key=self.MOCK_SPACE_KEY, max_pages=2)
assert mock_confluence.get_all_pages_from_space.call_count == 1
assert len(documents) == 2
assert all(isinstance(doc, Document) for doc in documents)
assert documents[0].page_content == "Content 123"
assert documents[1].page_content == "Content 456"
assert mock_confluence.get_page_by_id.call_count == 0
assert mock_confluence.get_all_pages_by_label.call_count == 0
assert mock_confluence.cql.call_count == 0
assert mock_confluence.get_page_child_by_type.call_count == 0
def _get_mock_confluence_loader(
self, mock_confluence: MagicMock
) -> ConfluenceLoader:
confluence_loader = ConfluenceLoader(
url=self.CONFLUENCE_URL,
username=self.MOCK_USERNAME,
api_key=self.MOCK_API_TOKEN,
)
confluence_loader.confluence = mock_confluence
return confluence_loader
def _get_mock_page(self, page_id: str) -> Dict:
return {
"id": f"{page_id}",
"title": f"Page {page_id}",
"body": {"storage": {"value": f"<p>Content {page_id}</p>"}},
"status": "current",
"type": "page",
"_links": {
"self": f"{self.CONFLUENCE_URL}/rest/api/content/{page_id}",
"tinyui": "/x/tiny_ui_link",
"editui": f"/pages/resumedraft.action?draftId={page_id}",
"webui": f"/spaces/{self.MOCK_SPACE_KEY}/overview",
},
}
def _get_mock_page_restrictions(self, page_id: str) -> Dict:
return {
"read": {
"operation": "read",
"restrictions": {
"user": {"results": [], "start": 0, "limit": 200, "size": 0},
"group": {"results": [], "start": 0, "limit": 200, "size": 0},
},
"_expandable": {"content": f"/rest/api/content/{page_id}"},
"_links": {
"self": f"{self.CONFLUENCE_URL}/rest/api/content/{page_id}/restriction/byOperation/read" # noqa: E501
},
},
"update": {
"operation": "update",
"restrictions": {
"user": {"results": [], "start": 0, "limit": 200, "size": 0},
"group": {"results": [], "start": 0, "limit": 200, "size": 0},
},
"_expandable": {"content": f"/rest/api/content/{page_id}"},
"_links": {
"self": f"{self.CONFLUENCE_URL}/rest/api/content/{page_id}/restriction/byOperation/update" # noqa: E501
},
},
"_links": {
"self": f"{self.CONFLUENCE_URL}/rest/api/content/{page_id}/restriction/byOperation", # noqa: E501
"base": self.CONFLUENCE_URL,
"context": "/wiki",
},
}