forked from Archives/langchain
b4de839ed8
This PR makes several improvements: - Previously it was not possible to load spaces of more than 100 pages. The `limit` was being used both as an overall page limit *and* as a per-request pagination limit. This, combined with the fact that Atlassian seems to apply a server-side hard limit of 100 when page content is expanded, meant it wasn't possible to download more than 100 pages. Now `limit` is used *only* as a per-request pagination limit, and `max_pages` is introduced as the way to limit the total number of pages returned by the paginator. - Document metadata now includes `source` (the source URL), making it compatible with `RetrievalQAWithSourcesChain`. - It is now possible to include inline and footer comments. - It is now possible to pass `verify_ssl=False` and other parameters to the Confluence object for use cases that require it.
55 lines
1.9 KiB
Python
55 lines
1.9 KiB
Python
import pytest
|
|
|
|
from langchain.document_loaders.confluence import ConfluenceLoader
|
|
|
|
# Record whether the optional `atlassian` package can be imported; the tests
# in this module use this flag in their skip conditions.
try:
    from atlassian import Confluence  # noqa: F401
except ImportError:
    confluence_installed = False
else:
    confluence_installed = True
|
|
|
|
|
|
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_load_single_confluence_page() -> None:
    """Load page id 33189 and check its content and metadata fields."""
    expected_source = (
        "https://templates.atlassian.net/wiki/"
        "spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
    )

    documents = ConfluenceLoader(url="https://templates.atlassian.net/wiki/").load(
        page_ids=["33189"]
    )

    # Exactly one document is expected for the single requested page id.
    assert len(documents) == 1

    page = documents[0]
    assert page.page_content is not None

    metadata = page.metadata
    assert metadata["id"] == "33189"
    assert metadata["title"] == "An easy intro to using Confluence"
    assert metadata["source"] == expected_source
|
|
|
|
|
|
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_load_full_confluence_space() -> None:
    """Load the whole "RD" space and check how many documents come back."""
    confluence_loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
    documents = confluence_loader.load(space_key="RD")

    # NOTE(review): 14 presumably matches the number of pages in the space
    # when this test was written -- confirm if this assertion starts failing.
    assert len(documents) == 14

    first_document = documents[0]
    assert first_document.page_content is not None
|
|
|
|
|
|
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_confluence_pagination() -> None:
    """Load the "RD" space with a per-request ``limit`` and a ``max_pages`` cap."""
    page_size, page_cap = 3, 5
    confluence_loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")

    # Two requests are issued, each with a limit of `page_size`, until the
    # `max_pages` cap of `page_cap` is reached.
    documents = confluence_loader.load(
        space_key="RD", limit=page_size, max_pages=page_cap
    )

    assert len(documents) == page_cap

    first_document = documents[0]
    assert first_document.page_content is not None
|
|
|
|
|
|
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_pass_confluence_kwargs() -> None:
    """Check that ``confluence_kwargs`` shows up on ``loader.confluence``."""
    client_options = {"verify_ssl": False}
    confluence_loader = ConfluenceLoader(
        url="https://templates.atlassian.net/wiki/",
        confluence_kwargs=client_options,
    )

    # `verify_ssl=False` was passed in, so the `confluence` attribute must
    # report exactly `False`.
    client = confluence_loader.confluence
    assert client.verify_ssl is False
|