Several confluence loader improvements (#3300)

This PR makes several improvements to the Confluence loader:

- Previously it was not possible to load spaces of more than 100 pages.
The `limit` was being used both as an overall page limit *and* as a per
request pagination limit. This, in combination with the fact that
Atlassian seems to use a server-side hard limit of 100 when page content
is expanded, meant it wasn't possible to download >100 pages. Now
`limit` is used *only* as a per-request pagination limit and `max_pages`
is introduced as the way to limit the total number of pages returned by
the paginator.
- Document metadata now includes `source` (the source url), making it
compatible with `RetrievalQAWithSourcesChain`.
- It is now possible to include inline and footer comments.
- It is now possible to pass `verify_ssl=False` and other parameters to
the confluence object for use cases that require it.
This commit is contained in:
Luke Harris 2023-04-23 23:06:10 +01:00 committed by GitHub
parent 651cb62556
commit b4de839ed8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 87 additions and 24 deletions

View File

@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
:type min_retry_seconds: Optional[int], optional
:param max_retry_seconds: defaults to 10
:type max_retry_seconds: Optional[int], optional
:param confluence_kwargs: additional kwargs to initialize confluence with
:type confluence_kwargs: dict, optional
:raises ValueError: Errors while validating input
:raises ImportError: Required dependencies not installed.
"""
@ -74,7 +76,9 @@ class ConfluenceLoader(BaseLoader):
number_of_retries: Optional[int] = 3,
min_retry_seconds: Optional[int] = 2,
max_retry_seconds: Optional[int] = 10,
confluence_kwargs: Optional[dict] = None,
):
confluence_kwargs = confluence_kwargs or {}
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
if errors:
raise ValueError(f"Error(s) while validating input: {errors}")
@ -93,10 +97,16 @@ class ConfluenceLoader(BaseLoader):
)
if oauth2:
self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
self.confluence = Confluence(
url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
)
else:
self.confluence = Confluence(
url=url, username=username, password=api_key, cloud=cloud
url=url,
username=username,
password=api_key,
cloud=cloud,
**confluence_kwargs,
)
@staticmethod
@ -147,7 +157,9 @@ class ConfluenceLoader(BaseLoader):
label: Optional[str] = None,
cql: Optional[str] = None,
include_attachments: bool = False,
include_comments: bool = False,
limit: Optional[int] = 50,
max_pages: Optional[int] = 1000,
) -> List[Document]:
"""
:param space_key: Space key retrieved from a confluence URL, defaults to None
@ -160,8 +172,12 @@ class ConfluenceLoader(BaseLoader):
:type cql: Optional[str], optional
:param include_attachments: defaults to False
:type include_attachments: bool, optional
:param limit: Maximum number of pages to retrieve, defaults to 50
:param include_comments: defaults to False
:type include_comments: bool, optional
:param limit: Maximum number of pages to retrieve per request, defaults to 50
:type limit: int, optional
:param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
:type max_pages: int, optional
:raises ValueError: _description_
:raises ImportError: _description_
:return: _description_
@ -191,10 +207,13 @@ class ConfluenceLoader(BaseLoader):
self.confluence.get_all_pages_from_space,
space=space_key,
limit=limit,
max_pages=max_pages,
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, text_maker)
doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc)
if label:
@ -202,18 +221,27 @@ class ConfluenceLoader(BaseLoader):
self.confluence.get_all_pages_by_label,
label=label,
limit=limit,
max_pages=max_pages,
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, text_maker)
doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc)
if cql:
pages = self.paginate_request(
self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
self.confluence.cql,
cql=cql,
limit=limit,
max_pages=max_pages,
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, text_maker)
doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc)
if page_ids:
@ -231,7 +259,9 @@ class ConfluenceLoader(BaseLoader):
before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id)
page = get_page(page_id=page_id, expand="body.storage.value")
doc = self.process_page(page, include_attachments, text_maker)
doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc)
return docs
@ -239,11 +269,13 @@ class ConfluenceLoader(BaseLoader):
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
"""Paginate the various methods to retrieve groups of pages.
Unforunately, due to page size, sometimes the Confluence API
doesn't match the limit value. Also, due to the Atlassian Python
Unfortunately, due to page size, sometimes the Confluence API
doesn't match the limit value. If `limit` is >100, Confluence
seems to cap the response to 100. Also, due to the Atlassian Python
package, we don't get the "next" values from the "_links" key because
they only return the value from the results key. So here, the pagination
starts from 0 and goes until the limit. We have to manually check if there
starts from 0 and goes until the max_pages, getting the `limit` number
of pages with each request. We have to manually check if there
are more docs based on the length of the returned list of pages, rather than
just checking for the presence of a `next` key in the response like this page
would have you do:
@ -255,10 +287,9 @@ class ConfluenceLoader(BaseLoader):
:rtype: List
"""
limit = kwargs["limit"]
page = 0
docs = []
while page < limit:
max_pages = kwargs.pop("max_pages")
docs: List[dict] = []
while len(docs) < max_pages:
get_pages = retry(
reraise=True,
stop=stop_after_attempt(
@ -271,16 +302,18 @@ class ConfluenceLoader(BaseLoader):
),
before_sleep=before_sleep_log(logger, logging.WARNING),
)(retrieval_method)
batch = get_pages(**kwargs, start=page)
if len(batch) < limit:
page = limit
else:
page += len(batch)
batch = get_pages(**kwargs, start=len(docs))
if not batch:
break
docs.extend(batch)
return docs
return docs[:max_pages]
def process_page(
self, page: dict, include_attachments: bool, text_maker: Any
self,
page: dict,
include_attachments: bool,
include_comments: bool,
text_maker: Any,
) -> Document:
if include_attachments:
attachment_texts = self.process_attachment(page["id"])
@ -289,8 +322,23 @@ class ConfluenceLoader(BaseLoader):
text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
attachment_texts
)
if include_comments:
comments = self.confluence.get_page_comments(
page["id"], expand="body.view.value", depth="all"
)["results"]
comment_texts = [
text_maker.handle(comment["body"]["view"]["value"])
for comment in comments
]
text = text + "".join(comment_texts)
return Document(
page_content=text, metadata={"title": page["title"], "id": page["id"]}
page_content=text,
metadata={
"title": page["title"],
"id": page["id"],
"source": self.base_url.strip("/") + page["_links"]["webui"],
},
)
def process_attachment(self, page_id: str) -> List[str]:

View File

@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
assert docs[0].page_content is not None
assert docs[0].metadata["id"] == "33189"
assert docs[0].metadata["title"] == "An easy intro to using Confluence"
assert docs[0].metadata["source"] == (
"https://templates.atlassian.net/wiki/"
"spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
)
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_confluence_pagination() -> None:
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
docs = loader.load(space_key="RD", limit=5)
# this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
docs = loader.load(space_key="RD", limit=3, max_pages=5)
assert len(docs) == 5
assert docs[0].page_content is not None
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_pass_confluence_kwargs() -> None:
loader = ConfluenceLoader(
url="https://templates.atlassian.net/wiki/",
confluence_kwargs={"verify_ssl": False},
)
assert loader.confluence.verify_ssl is False