Several confluence loader improvements (#3300)
This PR addresses several improvements:

- Previously it was not possible to load spaces of more than 100 pages. The `limit` parameter was being used both as an overall page limit *and* as a per-request pagination limit. This, in combination with the fact that Atlassian seems to enforce a server-side hard limit of 100 when page content is expanded, meant it wasn't possible to download more than 100 pages. Now `limit` is used *only* as a per-request pagination limit, and a new `max_pages` parameter caps the total number of pages returned by the paginator.
- Document metadata now includes `source` (the page URL), making the loader compatible with `RetrievalQAWithSourcesChain`.
- It is now possible to include inline and footer comments.
- It is now possible to pass `verify_ssl=False` and other parameters through to the underlying `Confluence` client for use cases that require it.
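For illustration, a minimal usage sketch of the new surface; the site URL, credentials, and space key are placeholders rather than anything from this PR:

```python
from langchain.document_loaders import ConfluenceLoader

# Placeholder site and credentials.
loader = ConfluenceLoader(
    url="https://example.atlassian.net/wiki",
    username="me@example.com",
    api_key="my-api-key",
    confluence_kwargs={"verify_ssl": False},  # forwarded to the Confluence client
)

# Fetch 50 pages per request, but stop once 500 pages have been collected.
docs = loader.load(
    space_key="SPACE",
    include_comments=True,  # new: append inline and footer comments to each page
    limit=50,
    max_pages=500,
)

# Every document now carries its page URL, which is what
# RetrievalQAWithSourcesChain expects under the `source` metadata key.
print(docs[0].metadata["source"])
```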
parent 651cb62556
commit b4de839ed8
@@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
     :type min_retry_seconds: Optional[int], optional
     :param max_retry_seconds: defaults to 10
     :type max_retry_seconds: Optional[int], optional
+    :param confluence_kwargs: additional kwargs to initialize confluence with
+    :type confluence_kwargs: dict, optional
     :raises ValueError: Errors while validating input
     :raises ImportError: Required dependencies not installed.
     """
@@ -74,7 +76,9 @@ class ConfluenceLoader(BaseLoader):
         number_of_retries: Optional[int] = 3,
         min_retry_seconds: Optional[int] = 2,
         max_retry_seconds: Optional[int] = 10,
+        confluence_kwargs: Optional[dict] = None,
     ):
+        confluence_kwargs = confluence_kwargs or {}
         errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
         if errors:
             raise ValueError(f"Error(s) while validating input: {errors}")
@@ -93,10 +97,16 @@ class ConfluenceLoader(BaseLoader):
             )

         if oauth2:
-            self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
+            self.confluence = Confluence(
+                url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
+            )
         else:
             self.confluence = Confluence(
-                url=url, username=username, password=api_key, cloud=cloud
+                url=url,
+                username=username,
+                password=api_key,
+                cloud=cloud,
+                **confluence_kwargs,
             )

     @staticmethod
@@ -147,7 +157,9 @@ class ConfluenceLoader(BaseLoader):
         label: Optional[str] = None,
         cql: Optional[str] = None,
         include_attachments: bool = False,
+        include_comments: bool = False,
         limit: Optional[int] = 50,
+        max_pages: Optional[int] = 1000,
     ) -> List[Document]:
         """
         :param space_key: Space key retrieved from a confluence URL, defaults to None
@@ -160,8 +172,12 @@ class ConfluenceLoader(BaseLoader):
         :type cql: Optional[str], optional
         :param include_attachments: defaults to False
         :type include_attachments: bool, optional
-        :param limit: Maximum number of pages to retrieve, defaults to 50
+        :param include_comments: defaults to False
+        :type include_comments: bool, optional
+        :param limit: Maximum number of pages to retrieve per request, defaults to 50
         :type limit: int, optional
+        :param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
+        :type max_pages: int, optional
         :raises ValueError: _description_
         :raises ImportError: _description_
         :return: _description_
@@ -191,10 +207,13 @@ class ConfluenceLoader(BaseLoader):
                 self.confluence.get_all_pages_from_space,
                 space=space_key,
                 limit=limit,
+                max_pages=max_pages,
                 expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if label:
@@ -202,18 +221,27 @@ class ConfluenceLoader(BaseLoader):
                 self.confluence.get_all_pages_by_label,
                 label=label,
                 limit=limit,
+                max_pages=max_pages,
                 expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if cql:
             pages = self.paginate_request(
-                self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
+                self.confluence.cql,
+                cql=cql,
+                limit=limit,
+                max_pages=max_pages,
+                expand="body.storage.value",
             )
             for page in pages:
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         if page_ids:
@@ -231,7 +259,9 @@ class ConfluenceLoader(BaseLoader):
                     before_sleep=before_sleep_log(logger, logging.WARNING),
                 )(self.confluence.get_page_by_id)
                 page = get_page(page_id=page_id, expand="body.storage.value")
-                doc = self.process_page(page, include_attachments, text_maker)
+                doc = self.process_page(
+                    page, include_attachments, include_comments, text_maker
+                )
                 docs.append(doc)

         return docs
@@ -239,11 +269,13 @@ class ConfluenceLoader(BaseLoader):
     def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
         """Paginate the various methods to retrieve groups of pages.

-        Unforunately, due to page size, sometimes the Confluence API
-        doesn't match the limit value. Also, due to the Atlassian Python
+        Unfortunately, due to page size, sometimes the Confluence API
+        doesn't match the limit value. If `limit` is >100 confluence
+        seems to cap the response to 100. Also, due to the Atlassian Python
         package, we don't get the "next" values from the "_links" key because
         they only return the value from the results key. So here, the pagination
-        starts from 0 and goes until the limit. We have to manually check if there
+        starts from 0 and goes until the max_pages, getting the `limit` number
+        of pages with each request. We have to manually check if there
         are more docs based on the length of the returned list of pages, rather than
         just checking for the presence of a `next` key in the response like this page
         would have you do:
@@ -255,10 +287,9 @@ class ConfluenceLoader(BaseLoader):
         :rtype: List
         """

-        limit = kwargs["limit"]
-        page = 0
-        docs = []
-        while page < limit:
+        max_pages = kwargs.pop("max_pages")
+        docs: List[dict] = []
+        while len(docs) < max_pages:
             get_pages = retry(
                 reraise=True,
                 stop=stop_after_attempt(
@@ -271,16 +302,18 @@ class ConfluenceLoader(BaseLoader):
                 ),
                 before_sleep=before_sleep_log(logger, logging.WARNING),
             )(retrieval_method)
-            batch = get_pages(**kwargs, start=page)
-            if len(batch) < limit:
-                page = limit
-            else:
-                page += len(batch)
+            batch = get_pages(**kwargs, start=len(docs))
+            if not batch:
+                break
             docs.extend(batch)
-        return docs
+        return docs[:max_pages]

     def process_page(
-        self, page: dict, include_attachments: bool, text_maker: Any
+        self,
+        page: dict,
+        include_attachments: bool,
+        include_comments: bool,
+        text_maker: Any,
     ) -> Document:
         if include_attachments:
             attachment_texts = self.process_attachment(page["id"])
@@ -289,8 +322,23 @@ class ConfluenceLoader(BaseLoader):
         text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
             attachment_texts
         )
+        if include_comments:
+            comments = self.confluence.get_page_comments(
+                page["id"], expand="body.view.value", depth="all"
+            )["results"]
+            comment_texts = [
+                text_maker.handle(comment["body"]["view"]["value"])
+                for comment in comments
+            ]
+            text = text + "".join(comment_texts)

         return Document(
-            page_content=text, metadata={"title": page["title"], "id": page["id"]}
+            page_content=text,
+            metadata={
+                "title": page["title"],
+                "id": page["id"],
+                "source": self.base_url.strip("/") + page["_links"]["webui"],
+            },
         )

     def process_attachment(self, page_id: str) -> List[str]:
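The hunks that follow belong to the loader's integration tests. Before them, here is the new pagination loop in isolation: a minimal sketch in which `paginate` and `fetch_batch` are illustrative stand-ins for `paginate_request` and the retried Atlassian call, not names from this PR:

```python
from typing import Any, Callable, List

def paginate(
    fetch_batch: Callable[..., List[dict]],  # stand-in for the retried Atlassian API call
    limit: int,
    max_pages: int,
    **kwargs: Any,
) -> List[dict]:
    """Request `limit` items per call until the server runs dry or max_pages is reached."""
    docs: List[dict] = []
    while len(docs) < max_pages:
        # `start` is the offset of the next unseen page, which is exactly len(docs).
        batch = fetch_batch(**kwargs, start=len(docs), limit=limit)
        if not batch:  # an empty batch means there is nothing left to fetch
            break
        docs.extend(batch)
    return docs[:max_pages]  # the final batch may overshoot max_pages, so truncate
```

With `limit=3` and `max_pages=5`, as in the updated pagination test below, two requests are issued: each returns 3 pages, the loop stops once 6 have been collected, and the final slice trims the result to 5.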
@@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
     assert docs[0].page_content is not None
     assert docs[0].metadata["id"] == "33189"
     assert docs[0].metadata["title"] == "An easy intro to using Confluence"
+    assert docs[0].metadata["source"] == (
+        "https://templates.atlassian.net/wiki/"
+        "spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
+    )


 @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
 @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
 def test_confluence_pagination() -> None:
     loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
-    docs = loader.load(space_key="RD", limit=5)
+    # this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
+    docs = loader.load(space_key="RD", limit=3, max_pages=5)

     assert len(docs) == 5
     assert docs[0].page_content is not None
+
+
+@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
+def test_pass_confluence_kwargs() -> None:
+    loader = ConfluenceLoader(
+        url="https://templates.atlassian.net/wiki/",
+        confluence_kwargs={"verify_ssl": False},
+    )
+
+    assert loader.confluence.verify_ssl is False
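Since the description calls out `RetrievalQAWithSourcesChain` compatibility, here is a hedged end-to-end sketch of that use case. The site, credentials, and question are placeholders, and it assumes an OpenAI API key plus the `faiss-cpu` package are available; treat it as an illustration of the idea rather than part of this PR:

```python
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.document_loaders import ConfluenceLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.vectorstores import FAISS

# Placeholder site and credentials.
loader = ConfluenceLoader(
    url="https://example.atlassian.net/wiki",
    username="me@example.com",
    api_key="my-api-key",
)
docs = loader.load(space_key="SPACE", max_pages=50)

# The `source` URL this PR adds to each document's metadata is what lets
# the chain report which Confluence pages an answer came from.
index = FAISS.from_documents(docs, OpenAIEmbeddings())
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm=OpenAI(temperature=0), retriever=index.as_retriever()
)
result = chain({"question": "How do I create a new page?"})
print(result["answer"])
print(result["sources"])
```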