From b4de839ed8a1bea7425a6923b2cd635068b6015a Mon Sep 17 00:00:00 2001 From: Luke Harris Date: Sun, 23 Apr 2023 23:06:10 +0100 Subject: [PATCH] Several confluence loader improvements (#3300) This PR addresses several improvements: - Previously it was not possible to load spaces of more than 100 pages. The `limit` was being used both as an overall page limit *and* as a per request pagination limit. This, in combination with the fact that atlassian seem to use a server-side hard limit of 100 when page content is expanded, meant it wasn't possible to download >100 pages. Now `limit` is used *only* as a per-request pagination limit and `max_pages` is introduced as the way to limit the total number of pages returned by the paginator. - Document metadata now includes `source` (the source url), making it compatible with `RetrievalQAWithSourcesChain`. - It is now possible to include inline and footer comments. - It is now possible to pass `verify_ssl=False` and other parameters to the confluence object for use cases that require it. --- langchain/document_loaders/confluence.py | 94 ++++++++++++++----- .../document_loaders/test_confluence.py | 17 +++- 2 files changed, 87 insertions(+), 24 deletions(-) diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py index 56598e64..3ae97f93 100644 --- a/langchain/document_loaders/confluence.py +++ b/langchain/document_loaders/confluence.py @@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader): :type min_retry_seconds: Optional[int], optional :param max_retry_seconds: defaults to 10 :type max_retry_seconds: Optional[int], optional + :param confluence_kwargs: additional kwargs to initialize confluence with + :type confluence_kwargs: dict, optional :raises ValueError: Errors while validating input :raises ImportError: Required dependencies not installed. """ @@ -74,7 +76,9 @@ class ConfluenceLoader(BaseLoader): number_of_retries: Optional[int] = 3, min_retry_seconds: Optional[int] = 2, max_retry_seconds: Optional[int] = 10, + confluence_kwargs: Optional[dict] = None, ): + confluence_kwargs = confluence_kwargs or {} errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2) if errors: raise ValueError(f"Error(s) while validating input: {errors}") @@ -93,10 +97,16 @@ class ConfluenceLoader(BaseLoader): ) if oauth2: - self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud) + self.confluence = Confluence( + url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs + ) else: self.confluence = Confluence( - url=url, username=username, password=api_key, cloud=cloud + url=url, + username=username, + password=api_key, + cloud=cloud, + **confluence_kwargs, ) @staticmethod @@ -147,7 +157,9 @@ class ConfluenceLoader(BaseLoader): label: Optional[str] = None, cql: Optional[str] = None, include_attachments: bool = False, + include_comments: bool = False, limit: Optional[int] = 50, + max_pages: Optional[int] = 1000, ) -> List[Document]: """ :param space_key: Space key retrieved from a confluence URL, defaults to None @@ -160,8 +172,12 @@ class ConfluenceLoader(BaseLoader): :type cql: Optional[str], optional :param include_attachments: defaults to False :type include_attachments: bool, optional - :param limit: Maximum number of pages to retrieve, defaults to 50 + :param include_comments: defaults to False + :type include_comments: bool, optional + :param limit: Maximum number of pages to retrieve per request, defaults to 50 :type limit: int, optional + :param max_pages: Maximum number of pages to retrieve in total, defaults 1000 + :type max_pages: int, optional :raises ValueError: _description_ :raises ImportError: _description_ :return: _description_ @@ -191,10 +207,13 @@ class ConfluenceLoader(BaseLoader): self.confluence.get_all_pages_from_space, space=space_key, limit=limit, + max_pages=max_pages, expand="body.storage.value", ) for page in pages: - doc = self.process_page(page, include_attachments, text_maker) + doc = self.process_page( + page, include_attachments, include_comments, text_maker + ) docs.append(doc) if label: @@ -202,18 +221,27 @@ class ConfluenceLoader(BaseLoader): self.confluence.get_all_pages_by_label, label=label, limit=limit, + max_pages=max_pages, expand="body.storage.value", ) for page in pages: - doc = self.process_page(page, include_attachments, text_maker) + doc = self.process_page( + page, include_attachments, include_comments, text_maker + ) docs.append(doc) if cql: pages = self.paginate_request( - self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value" + self.confluence.cql, + cql=cql, + limit=limit, + max_pages=max_pages, + expand="body.storage.value", ) for page in pages: - doc = self.process_page(page, include_attachments, text_maker) + doc = self.process_page( + page, include_attachments, include_comments, text_maker + ) docs.append(doc) if page_ids: @@ -231,7 +259,9 @@ class ConfluenceLoader(BaseLoader): before_sleep=before_sleep_log(logger, logging.WARNING), )(self.confluence.get_page_by_id) page = get_page(page_id=page_id, expand="body.storage.value") - doc = self.process_page(page, include_attachments, text_maker) + doc = self.process_page( + page, include_attachments, include_comments, text_maker + ) docs.append(doc) return docs @@ -239,11 +269,13 @@ class ConfluenceLoader(BaseLoader): def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List: """Paginate the various methods to retrieve groups of pages. - Unforunately, due to page size, sometimes the Confluence API - doesn't match the limit value. Also, due to the Atlassian Python + Unfortunately, due to page size, sometimes the Confluence API + doesn't match the limit value. If `limit` is >100 confluence + seems to cap the response to 100. Also, due to the Atlassian Python package, we don't get the "next" values from the "_links" key because they only return the value from the results key. So here, the pagination - starts from 0 and goes until the limit. We have to manually check if there + starts from 0 and goes until the max_pages, getting the `limit` number + of pages with each request. We have to manually check if there are more docs based on the length of the returned list of pages, rather than just checking for the presence of a `next` key in the response like this page would have you do: @@ -255,10 +287,9 @@ class ConfluenceLoader(BaseLoader): :rtype: List """ - limit = kwargs["limit"] - page = 0 - docs = [] - while page < limit: + max_pages = kwargs.pop("max_pages") + docs: List[dict] = [] + while len(docs) < max_pages: get_pages = retry( reraise=True, stop=stop_after_attempt( @@ -271,16 +302,18 @@ class ConfluenceLoader(BaseLoader): ), before_sleep=before_sleep_log(logger, logging.WARNING), )(retrieval_method) - batch = get_pages(**kwargs, start=page) - if len(batch) < limit: - page = limit - else: - page += len(batch) + batch = get_pages(**kwargs, start=len(docs)) + if not batch: + break docs.extend(batch) - return docs + return docs[:max_pages] def process_page( - self, page: dict, include_attachments: bool, text_maker: Any + self, + page: dict, + include_attachments: bool, + include_comments: bool, + text_maker: Any, ) -> Document: if include_attachments: attachment_texts = self.process_attachment(page["id"]) @@ -289,8 +322,23 @@ class ConfluenceLoader(BaseLoader): text = text_maker.handle(page["body"]["storage"]["value"]) + "".join( attachment_texts ) + if include_comments: + comments = self.confluence.get_page_comments( + page["id"], expand="body.view.value", depth="all" + )["results"] + comment_texts = [ + text_maker.handle(comment["body"]["view"]["value"]) + for comment in comments + ] + text = text + "".join(comment_texts) + return Document( - page_content=text, metadata={"title": page["title"], "id": page["id"]} + page_content=text, + metadata={ + "title": page["title"], + "id": page["id"], + "source": self.base_url.strip("/") + page["_links"]["webui"], + }, ) def process_attachment(self, page_id: str) -> List[str]: diff --git a/tests/integration_tests/document_loaders/test_confluence.py b/tests/integration_tests/document_loaders/test_confluence.py index 211e165f..983bc254 100644 --- a/tests/integration_tests/document_loaders/test_confluence.py +++ b/tests/integration_tests/document_loaders/test_confluence.py @@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None: assert docs[0].page_content is not None assert docs[0].metadata["id"] == "33189" assert docs[0].metadata["title"] == "An easy intro to using Confluence" + assert docs[0].metadata["source"] == ( + "https://templates.atlassian.net/wiki/" + "spaces/RD/pages/33189/An+easy+intro+to+using+Confluence" + ) @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed") @@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None: @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed") def test_confluence_pagination() -> None: loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/") - docs = loader.load(space_key="RD", limit=5) + # this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met + docs = loader.load(space_key="RD", limit=3, max_pages=5) assert len(docs) == 5 assert docs[0].page_content is not None + + +@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed") +def test_pass_confluence_kwargs() -> None: + loader = ConfluenceLoader( + url="https://templates.atlassian.net/wiki/", + confluence_kwargs={"verify_ssl": False}, + ) + + assert loader.confluence.verify_ssl is False