From d18e788ee38280f1c5cc78f5709fa2e766766fe7 Mon Sep 17 00:00:00 2001 From: obbiondo <45012063+obbiondo@users.noreply.github.com> Date: Thu, 4 May 2023 06:52:05 +0200 Subject: [PATCH] bugfix: return whole document when loading with ConfluenceLoader.load by label (#3980) The method confluence.get_all_pages_by_label returns only metadata about documents with a certain label (such as pageId, titles, ...). To return all documents with a certain label, we need to extract all page ids for that label and then fetch the pages' content by these ids. --------- Co-authored-by: Andrea Biondo --- langchain/document_loaders/confluence.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py index f02eb77493..f20ed5f366 100644 --- a/langchain/document_loaders/confluence.py +++ b/langchain/document_loaders/confluence.py @@ -219,9 +219,11 @@ class ConfluenceLoader(BaseLoader): max_pages=max_pages, expand="body.storage.value", ) - docs += self.process_pages( - pages, include_restricted_content, include_attachments, include_comments - ) + ids_by_label = [page["id"] for page in pages] + if page_ids: + page_ids = list(set(page_ids + ids_by_label)) + else: + page_ids = list(set(ids_by_label)) if cql: pages = self.paginate_request(