fix: TypeError when loading confluence pages by cql (#5878)

The Confluence loader uses the wrong API (`Confluence.cql()`, provided by `atlassian-python-api`) to load pages by CQL. `Confluence.cql()` is a wrapper around the `/rest/api/search` API, which searches for entities in Confluence. To search for pages in Confluence, the loader should use the `/rest/api/content/search` API instead.

#### Who can review?

Tag maintainers/contributors who might be interested: @eyurtsev

#### References

##### Cloud API

- https://developer.atlassian.com/cloud/confluence/rest/v1/api-group-content/#api-wiki-rest-api-content-search-get
- https://developer.atlassian.com/cloud/confluence/rest/v1/api-group-search/#api-wiki-rest-api-search-get

##### Server API

- https://docs.atlassian.com/ConfluenceServer/rest/8.3.1/#api/content-search
- https://docs.atlassian.com/ConfluenceServer/rest/8.3.1/#api/search
2023-06-12 04:23:22 +08:00 · 2023-06-12 04:23:22 +08:00 · 232faba796
commit 232faba796
parent d7d629911b
1 changed files with 15 additions and 2 deletions
--- a/langchain/document_loaders/confluence.py
+++ b/langchain/document_loaders/confluence.py
@ -1,7 +1,7 @@
 """Load Data from a Confluence Space"""
 import logging
 from io import BytesIO
-from typing import Any, Callable, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union

 from tenacity import (
    before_sleep_log,
@ -253,7 +253,7 @@ class ConfluenceLoader(BaseLoader):

        if cql:
            pages = self.paginate_request(
-                self.confluence.cql,
+                self._search_content_by_cql,
                cql=cql,
                limit=limit,
                max_pages=max_pages,
@ -292,6 +292,19 @@ class ConfluenceLoader(BaseLoader):

        return docs

+    def _search_content_by_cql(
+        self, cql: str, include_archived_spaces: Optional[bool] = None, **kwargs: Any
+    ) -> List[dict]:
+        url = "rest/api/content/search"  # content search, not rest/api/search
+
+        params: Dict[str, Any] = {"cql": cql}
+        params.update(kwargs)  # remaining keyword args become query params
+        if include_archived_spaces is not None:  # omit the param unless set
+            params["includeArchivedSpaces"] = include_archived_spaces
+
+        response = self.confluence.get(url, params=params)  # assumed dict; confirm
+        return response.get("results", [])  # [] when "results" is absent
+
    def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
        """Paginate the various methods to retrieve groups of pages.