Several confluence loader improvements (#3300)

This PR addresses several improvements:

- Previously it was not possible to load spaces of more than 100 pages.
The `limit` was being used both as an overall page limit *and* as a per
request pagination limit. This, in combination with the fact that
Atlassian seems to use a server-side hard limit of 100 when page content
is expanded, meant it wasn't possible to download >100 pages. Now
`limit` is used *only* as a per-request pagination limit and `max_pages`
is introduced as the way to limit the total number of pages returned by
the paginator.
- Document metadata now includes `source` (the source url), making it
compatible with `RetrievalQAWithSourcesChain`.
- It is now possible to include inline and footer comments.
- It is now possible to pass `verify_ssl=False` and other parameters to
the confluence object for use cases that require it.
fix_agent_callbacks
Luke Harris 1 year ago committed by GitHub
parent 651cb62556
commit b4de839ed8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
:type min_retry_seconds: Optional[int], optional :type min_retry_seconds: Optional[int], optional
:param max_retry_seconds: defaults to 10 :param max_retry_seconds: defaults to 10
:type max_retry_seconds: Optional[int], optional :type max_retry_seconds: Optional[int], optional
:param confluence_kwargs: additional kwargs to initialize confluence with
:type confluence_kwargs: dict, optional
:raises ValueError: Errors while validating input :raises ValueError: Errors while validating input
:raises ImportError: Required dependencies not installed. :raises ImportError: Required dependencies not installed.
""" """
@ -74,7 +76,9 @@ class ConfluenceLoader(BaseLoader):
number_of_retries: Optional[int] = 3, number_of_retries: Optional[int] = 3,
min_retry_seconds: Optional[int] = 2, min_retry_seconds: Optional[int] = 2,
max_retry_seconds: Optional[int] = 10, max_retry_seconds: Optional[int] = 10,
confluence_kwargs: Optional[dict] = None,
): ):
confluence_kwargs = confluence_kwargs or {}
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2) errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
if errors: if errors:
raise ValueError(f"Error(s) while validating input: {errors}") raise ValueError(f"Error(s) while validating input: {errors}")
@ -93,10 +97,16 @@ class ConfluenceLoader(BaseLoader):
) )
if oauth2: if oauth2:
self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud) self.confluence = Confluence(
url=url, oauth2=oauth2, cloud=cloud, **confluence_kwargs
)
else: else:
self.confluence = Confluence( self.confluence = Confluence(
url=url, username=username, password=api_key, cloud=cloud url=url,
username=username,
password=api_key,
cloud=cloud,
**confluence_kwargs,
) )
@staticmethod @staticmethod
@ -147,7 +157,9 @@ class ConfluenceLoader(BaseLoader):
label: Optional[str] = None, label: Optional[str] = None,
cql: Optional[str] = None, cql: Optional[str] = None,
include_attachments: bool = False, include_attachments: bool = False,
include_comments: bool = False,
limit: Optional[int] = 50, limit: Optional[int] = 50,
max_pages: Optional[int] = 1000,
) -> List[Document]: ) -> List[Document]:
""" """
:param space_key: Space key retrieved from a confluence URL, defaults to None :param space_key: Space key retrieved from a confluence URL, defaults to None
@ -160,8 +172,12 @@ class ConfluenceLoader(BaseLoader):
:type cql: Optional[str], optional :type cql: Optional[str], optional
:param include_attachments: defaults to False :param include_attachments: defaults to False
:type include_attachments: bool, optional :type include_attachments: bool, optional
:param limit: Maximum number of pages to retrieve, defaults to 50 :param include_comments: defaults to False
:type include_comments: bool, optional
:param limit: Maximum number of pages to retrieve per request, defaults to 50
:type limit: int, optional :type limit: int, optional
:param max_pages: Maximum number of pages to retrieve in total, defaults 1000
:type max_pages: int, optional
:raises ValueError: _description_ :raises ValueError: _description_
:raises ImportError: _description_ :raises ImportError: _description_
:return: _description_ :return: _description_
@ -191,10 +207,13 @@ class ConfluenceLoader(BaseLoader):
self.confluence.get_all_pages_from_space, self.confluence.get_all_pages_from_space,
space=space_key, space=space_key,
limit=limit, limit=limit,
max_pages=max_pages,
expand="body.storage.value", expand="body.storage.value",
) )
for page in pages: for page in pages:
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
if label: if label:
@ -202,18 +221,27 @@ class ConfluenceLoader(BaseLoader):
self.confluence.get_all_pages_by_label, self.confluence.get_all_pages_by_label,
label=label, label=label,
limit=limit, limit=limit,
max_pages=max_pages,
expand="body.storage.value", expand="body.storage.value",
) )
for page in pages: for page in pages:
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
if cql: if cql:
pages = self.paginate_request( pages = self.paginate_request(
self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value" self.confluence.cql,
cql=cql,
limit=limit,
max_pages=max_pages,
expand="body.storage.value",
) )
for page in pages: for page in pages:
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
if page_ids: if page_ids:
@ -231,7 +259,9 @@ class ConfluenceLoader(BaseLoader):
before_sleep=before_sleep_log(logger, logging.WARNING), before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id) )(self.confluence.get_page_by_id)
page = get_page(page_id=page_id, expand="body.storage.value") page = get_page(page_id=page_id, expand="body.storage.value")
doc = self.process_page(page, include_attachments, text_maker) doc = self.process_page(
page, include_attachments, include_comments, text_maker
)
docs.append(doc) docs.append(doc)
return docs return docs
@ -239,11 +269,13 @@ class ConfluenceLoader(BaseLoader):
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List: def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
"""Paginate the various methods to retrieve groups of pages. """Paginate the various methods to retrieve groups of pages.
Unforunately, due to page size, sometimes the Confluence API Unfortunately, due to page size, sometimes the Confluence API
doesn't match the limit value. Also, due to the Atlassian Python doesn't match the limit value. If `limit` is >100 confluence
seems to cap the response to 100. Also, due to the Atlassian Python
package, we don't get the "next" values from the "_links" key because package, we don't get the "next" values from the "_links" key because
they only return the value from the results key. So here, the pagination they only return the value from the results key. So here, the pagination
starts from 0 and goes until the limit. We have to manually check if there starts from 0 and goes until the max_pages, getting the `limit` number
of pages with each request. We have to manually check if there
are more docs based on the length of the returned list of pages, rather than are more docs based on the length of the returned list of pages, rather than
just checking for the presence of a `next` key in the response like this page just checking for the presence of a `next` key in the response like this page
would have you do: would have you do:
@ -255,10 +287,9 @@ class ConfluenceLoader(BaseLoader):
:rtype: List :rtype: List
""" """
limit = kwargs["limit"] max_pages = kwargs.pop("max_pages")
page = 0 docs: List[dict] = []
docs = [] while len(docs) < max_pages:
while page < limit:
get_pages = retry( get_pages = retry(
reraise=True, reraise=True,
stop=stop_after_attempt( stop=stop_after_attempt(
@ -271,16 +302,18 @@ class ConfluenceLoader(BaseLoader):
), ),
before_sleep=before_sleep_log(logger, logging.WARNING), before_sleep=before_sleep_log(logger, logging.WARNING),
)(retrieval_method) )(retrieval_method)
batch = get_pages(**kwargs, start=page) batch = get_pages(**kwargs, start=len(docs))
if len(batch) < limit: if not batch:
page = limit break
else:
page += len(batch)
docs.extend(batch) docs.extend(batch)
return docs return docs[:max_pages]
def process_page( def process_page(
self, page: dict, include_attachments: bool, text_maker: Any self,
page: dict,
include_attachments: bool,
include_comments: bool,
text_maker: Any,
) -> Document: ) -> Document:
if include_attachments: if include_attachments:
attachment_texts = self.process_attachment(page["id"]) attachment_texts = self.process_attachment(page["id"])
@ -289,8 +322,23 @@ class ConfluenceLoader(BaseLoader):
text = text_maker.handle(page["body"]["storage"]["value"]) + "".join( text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
attachment_texts attachment_texts
) )
if include_comments:
comments = self.confluence.get_page_comments(
page["id"], expand="body.view.value", depth="all"
)["results"]
comment_texts = [
text_maker.handle(comment["body"]["view"]["value"])
for comment in comments
]
text = text + "".join(comment_texts)
return Document( return Document(
page_content=text, metadata={"title": page["title"], "id": page["id"]} page_content=text,
metadata={
"title": page["title"],
"id": page["id"],
"source": self.base_url.strip("/") + page["_links"]["webui"],
},
) )
def process_attachment(self, page_id: str) -> List[str]: def process_attachment(self, page_id: str) -> List[str]:

@ -19,6 +19,10 @@ def test_load_single_confluence_page() -> None:
assert docs[0].page_content is not None assert docs[0].page_content is not None
assert docs[0].metadata["id"] == "33189" assert docs[0].metadata["id"] == "33189"
assert docs[0].metadata["title"] == "An easy intro to using Confluence" assert docs[0].metadata["title"] == "An easy intro to using Confluence"
assert docs[0].metadata["source"] == (
"https://templates.atlassian.net/wiki/"
"spaces/RD/pages/33189/An+easy+intro+to+using+Confluence"
)
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed") @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
@ -33,7 +37,18 @@ def test_load_full_confluence_space() -> None:
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed") @pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_confluence_pagination() -> None: def test_confluence_pagination() -> None:
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/") loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
docs = loader.load(space_key="RD", limit=5) # this will issue 2 requests; each with a limit of 3 until the max_pages of 5 is met
docs = loader.load(space_key="RD", limit=3, max_pages=5)
assert len(docs) == 5 assert len(docs) == 5
assert docs[0].page_content is not None assert docs[0].page_content is not None
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_pass_confluence_kwargs() -> None:
loader = ConfluenceLoader(
url="https://templates.atlassian.net/wiki/",
confluence_kwargs={"verify_ssl": False},
)
assert loader.confluence.verify_ssl is False

Loading…
Cancel
Save