class ContentFormat(str, Enum):
    """Representations of a Confluence page body that the loader can request.

    Each member's value is the string passed as the ``expand`` argument when
    pages are fetched, so the matching body representation is present in the
    returned page dict.
    """

    # Raw storage representation of the page body.
    STORAGE = "body.storage"
    # Rendered representation of the page body, as shown to users.
    VIEW = "body.view"

    def get_content(self, page: dict) -> str:
        """Extract the body text of ``page`` for this content format.

        :param page: Page dict containing a ``body`` entry for this format
        :type page: dict
        :return: The ``value`` string stored under the matching body entry
        :rtype: str
        :raises ValueError: If this member is neither STORAGE nor VIEW
        """
        # Guard first so the happy path below is a single lookup.
        if self != ContentFormat.STORAGE and self != ContentFormat.VIEW:
            raise ValueError("unknown content format")

        representation = "storage" if self == ContentFormat.STORAGE else "view"
        return page["body"][representation]["value"]
+ Hint: space_key and page_id can both be found in the URL of a page in Confluence - https://yoursite.atlassian.com/wiki/spaces//pages/ @@ -178,6 +198,7 @@ class ConfluenceLoader(BaseLoader): include_archived_content: bool = False, include_attachments: bool = False, include_comments: bool = False, + content_format: ContentFormat = ContentFormat.STORAGE, limit: Optional[int] = 50, max_pages: Optional[int] = 1000, ocr_languages: Optional[str] = None, @@ -200,6 +221,8 @@ class ConfluenceLoader(BaseLoader): :type include_attachments: bool, optional :param include_comments: defaults to False :type include_comments: bool, optional + :param content_format: Specify content format, defaults to ContentFormat.STORAGE + :type content_format: ContentFormat :param limit: Maximum number of pages to retrieve per request, defaults to 50 :type limit: int, optional :param max_pages: Maximum number of pages to retrieve in total, defaults 1000 @@ -228,13 +251,14 @@ class ConfluenceLoader(BaseLoader): limit=limit, max_pages=max_pages, status="any" if include_archived_content else "current", - expand="body.storage.value", + expand=content_format.value, ) docs += self.process_pages( pages, include_restricted_content, include_attachments, include_comments, + content_format, ocr_languages, ) @@ -258,13 +282,14 @@ class ConfluenceLoader(BaseLoader): limit=limit, max_pages=max_pages, include_archived_spaces=include_archived_content, - expand="body.storage.value", + expand=content_format.value, ) docs += self.process_pages( pages, include_restricted_content, include_attachments, include_comments, + content_format, ocr_languages, ) @@ -282,11 +307,15 @@ class ConfluenceLoader(BaseLoader): ), before_sleep=before_sleep_log(logger, logging.WARNING), )(self.confluence.get_page_by_id) - page = get_page(page_id=page_id, expand="body.storage.value") + page = get_page(page_id=page_id, expand=content_format.value) if not include_restricted_content and not self.is_public_page(page): continue doc = 
self.process_page( - page, include_attachments, include_comments, ocr_languages + page, + include_attachments, + include_comments, + content_format, + ocr_languages, ) docs.append(doc) @@ -363,6 +392,7 @@ class ConfluenceLoader(BaseLoader): include_restricted_content: bool, include_attachments: bool, include_comments: bool, + content_format: ContentFormat, ocr_languages: Optional[str] = None, ) -> List[Document]: """Process a list of pages into a list of documents.""" @@ -371,7 +401,11 @@ class ConfluenceLoader(BaseLoader): if not include_restricted_content and not self.is_public_page(page): continue doc = self.process_page( - page, include_attachments, include_comments, ocr_languages + page, + include_attachments, + include_comments, + content_format, + ocr_languages, ) docs.append(doc) @@ -382,6 +416,7 @@ class ConfluenceLoader(BaseLoader): page: dict, include_attachments: bool, include_comments: bool, + content_format: ContentFormat, ocr_languages: Optional[str] = None, ) -> Document: try: @@ -396,9 +431,11 @@ class ConfluenceLoader(BaseLoader): attachment_texts = self.process_attachment(page["id"], ocr_languages) else: attachment_texts = [] - text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text( - " ", strip=True - ) + "".join(attachment_texts) + + content = content_format.get_content(page) + text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join( + attachment_texts + ) if include_comments: comments = self.confluence.get_page_comments( page["id"], expand="body.view.value", depth="all"