Several confluence loader improvements (#3300)

This PR addresses several improvements:

- Previously it was not possible to load spaces of more than 100 pages.
The `limit` was being used both as an overall page limit *and* as a per
request pagination limit. This, in combination with the fact that
Atlassian seems to use a server-side hard limit of 100 when page content
is expanded, meant it wasn't possible to download >100 pages. Now
`limit` is used *only* as a per-request pagination limit and `max_pages`
is introduced as the way to limit the total number of pages returned by
the paginator.
- Document metadata now includes `source` (the source url), making it
compatible with `RetrievalQAWithSourcesChain`.
- It is now possible to include inline and footer comments.
- It is now possible to pass `verify_ssl=False` and other parameters to
the confluence object for use cases that require it.
fix_agent_callbacks
Luke Harris 1 year ago committed by GitHub
parent 651cb62556
commit b4de839ed8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
:type min_retry_seconds: Optional[int], optional :type min_retry_seconds: Optional[int], optional
:param max_retry_seconds: defaults to 10 :param max_retry_seconds: defaults to 10
:type max_retry_seconds: Optional[int], optional :type max_retry_seconds: Optional[int], optional
:param confluence_kwargs: additional kwargs to initialize confluence with
:type confluence_kwargs: dict, optional
:raises ValueError: Errors while validating input :raises ValueError: Errors while validating input
:raises ImportError: Required dependencies not installed. :raises ImportError: Required dependencies not installed.
""" """
@ -74,7 +76,9 @@ class ConfluenceLoader(BaseLoader):
number_of_retries: Optional[int] = 3, number_of_retries: Optional[int] = 3,
min_retry_seconds: Optional[int] = 2, min_retry_seconds: Optional[int] = 2,
max_retry_seconds: Optional[int] = 10, max_retry_seconds: Optional[int] = 10,
confluence_kwargs: Optional[dict] = None,
): ):
confluence_kwargs = confluence_kwargs or {}
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2) errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
if errors: if errors:
raise ValueError(f"Error(s) while validating input: {errors}") raise ValueError(f"Error(s) while validating input: {errors}")
@ -93,10 +97,16 @@ class ConfluenceLoader(BaseLoader):
) )
if oauth2: if oauth2:
self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud) self.confluence = Confluence(
url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
)
else: else:
self.confluence = Confluence( self.confluence = Confluence(
url=url, username=username, password=api_key, cloud=cloud url=url,
username=username,
password=api_key,
cloud=cloud,
**confluence_kwargs,
) )
@staticmethod @staticmethod
@ -147,7 +157,9 @@ class ConfluenceLoader(BaseLoader):
label: Optional[str] = None, label: Optional[str] = None,
cql: Optional[str] = None, cql: Optional[str] = None,
include_attachments: bool = False, include_attachments: bool = False,
include_comments: bool = False,
limit: Optional[int] = 50, limit: Optional[int] = 50,
max_pages: Optional[int] = 1000,
) -> List[Document]: ) -> List[Document]:
""" """
:param space_key: Space key retrieved from a confluence URL, defaults to None :param space_key: Space key retrieved from a confluence URL, defaults to None
@ -160,8 +172,12 @@ class ConfluenceLoader(BaseLoader):
:type cql: Optional[str], optional :type cql: Optional[str], optional
:param include_attachments: defaults to False :param include_attachments: defaults to False
:type include_attachments: bool, optional :type include_attachments: bool, optional
:param limit: Maximum number of pages to retrieve, defaults to 50 :param include_comments: defaults to False
:type include_comments: bool, optional
:param limit: Maximum number of pages to retrieve per request, defaults to 50
:type limit: int, optional :type limit: int, optional
:param max_pages: Maximum number of pages to retrieve in total, defaults 1000
:type max_pages: int, optional
:raises ValueError: _description_ :raises ValueError: _description_
:raises ImportError: _description_ :raises ImportError: _description_
:return: _description_ :return: _description_
@ -191,10 +207,13 @@ class ConfluenceLoader(BaseLoader):
self.confluence.get_all_pages_from_space, self.confluence.get_all_pages_from_space,
space=space_key, space=space_key,
limit=limit, limit=limit,
max_pages=max_pages,
expand="body.storage.value", expand="body.storage.value",
) )
for page in pages: for page in pages:
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
if label: if label:
@ -202,18 +221,27 @@ class ConfluenceLoader(BaseLoader):
self.confluence.get_all_pages_by_label, self.confluence.get_all_pages_by_label,
label=label, label=label,
limit=limit, limit=limit,
max_pages=max_pages,
expand="body.storage.value", expand="body.storage.value",
) )
for page in pages: for page in pages:
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
if cql: if cql:
pages = self.paginate_request( pages = self.paginate_request(
self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value" self.confluence.cql,
cql=cql,
limit=limit,
max_pages=max_pages,
expand="body.storage.value",
) )
for page in pages: for page in pages:
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
if page_ids: if page_ids:
@ -231,7 +259,9 @@ class ConfluenceLoader(BaseLoader):
before_sleep=before_sleep_log(logger, logging.WARNING), before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id) )(self.confluence.get_page_by_id)
page = get_page(page_id=page_id, expand="body.storage.value") page = get_page(page_id=page_id, expand="body.storage.value")
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
return docs return docs
@ -239,11 +269,13 @@ class ConfluenceLoader(BaseLoader):
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List: def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
"""Paginate the various methods to retrieve groups of pages. """Paginate the various methods to retrieve groups of pages.
Unforunately, due to page size, sometimes the Confluence API Unfortunately, due to page size, sometimes the Confluence API
doesn't match the limit value. Also, due to the Atlassian Python doesn't match the limit value. If `limit` is >100 confluence
seems to cap the response to 100. Also, due to the Atlassian Python
package, we don't get the "next" values from the "_links" key because package, we don't get the "next" values from the "_links" key because
they only return the value from the results key. So here, the pagination they only return the value from the results key. So here, the pagination
starts from 0 and goes until the limit. We have to manually check if there starts from 0 and goes until the max_pages, getting the `limit` number
of pages with each request. We have to manually check if there
are more docs based on the length of the returned list of pages, rather than are more docs based on the length of the returned list of pages, rather than
just checking for the presence of a `next` key in the response like this page just checking for the presence of a `next` key in the response like this page
would have you do: would have you do:
@ -255,10 +287,9 @@ class ConfluenceLoader(BaseLoader):
:rtype: List :rtype: List
""" """
limit = kwargs["limit"] max_pages = kwargs.pop("max_pages")
page = 0 docs: List[dict] = []
docs = [] while len(docs) < max_pages:
while page < limit:
get_pages = retry( get_pages = retry(
reraise=True, reraise=True,
stop=stop_after_attempt( stop=stop_after_attempt(
@ -271,16 +302,18 @@ class ConfluenceLoader(BaseLoader):
), ),
before_sleep=before_sleep_log(logger, logging.WARNING), before_sleep=before_sleep_log(logger, logging.WARNING),
)(retrieval_method) )(retrieval_method)
batch = get_pages(**kwargs, start=page) batch = get_pages(**kwargs, start=len(docs))
if len(batch) < limit: if not batch:
page = limit break
else:
page += len(batch)
docs.extend(batch) docs.extend(batch)
return docs return docs[:max_pages]
def process_page( def process_page(
self, page: dict, include_attachments: bool, text_maker: Any self,
page: dict,
include_attachments: bool,
include_comments: bool,
text_maker: Any,
) -> Document: ) -> Document:
if include_attachments: if include_attachments:
attachment_texts = self.process_attachment(page["id"]) attachment_texts = self.process_attachment(page["id"])
@ -289,8 +322,23 @@ class ConfluenceLoader(BaseLoader):
text = text_maker.handle(page["body"]["storage"]["value"]) + "".join( text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
attachment_texts attachment_texts
) )
if include_comments:
comments = self.confluence.get_page_comments(
page["id"], expand="body.view.value", depth="all"
)["results"]
comment_texts = [
text_maker.handle(comment["body"]["view"]["value"])
for comment in comments
]
text = text + "".join(comment_texts)
return Document( return Document(
page_content=text, metadata={"title": page["title"], "id": page["id"]} page_content=text,
metadata={
"title": page["title"],
"id": page["id"],
"source": self.base_url.strip("/") + page["_links"]["webui"],
},
) )
def process_attachment(self, page_id: str) -> List[str]: def process_attachment(self, page_id: str) -> List[str]:

@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
assert docs[0].page_content is not None assert docs[0].page_content is not None
assert docs[0].metadata["id"] == "33189" assert docs[0].metadata["id"] == "33189"
assert docs[0].metadata["title"] == "An easy intro to using Confluence" assert docs[0].metadata["title"] == "An easy intro to using Confluence"
assert docs[0].metadata["source"] == (
"https://templates.atlassian.net/wiki/"
"spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
)
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed") @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed") @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_confluence_pagination() -> None: def test_confluence_pagination() -> None:
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/") loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
docs = loader.load(space_key="RD", limit=5) # this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
docs = loader.load(space_key="RD", limit=3, max_pages=5)
assert len(docs) == 5 assert len(docs) == 5
assert docs[0].page_content is not None assert docs[0].page_content is not None
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_pass_confluence_kwargs() -> None:
loader = ConfluenceLoader(
url="https://templates.atlassian.net/wiki/",
confluence_kwargs={"verify_ssl": False},
)
assert loader.confluence.verify_ssl is False

Loading…
Cancel
Save