forked from Archives/langchain
feat: add content_format param to ConfluenceLoader.load() (#5922)
The Confluence API supports different formats of page content. The storage format is the raw XML representation for storage. The view format is the HTML representation for viewing, with macros rendered as though it is viewed by users. Add the `content_format` parameter to `ConfluenceLoader.load()` to specify the content format; this is set to `ContentFormat.STORAGE` by default. #### Who can review? Tag maintainers/contributors who might be interested: @eyurtsev --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
parent
c5a46e7435
commit
7ad13cdbdb
@ -1,5 +1,6 @@
|
|||||||
"""Load Data from a Confluence Space"""
|
"""Load Data from a Confluence Space"""
|
||||||
import logging
|
import logging
|
||||||
|
from enum import Enum
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from typing import Any, Callable, Dict, List, Optional, Union
|
from typing import Any, Callable, Dict, List, Optional, Union
|
||||||
|
|
||||||
@ -16,6 +17,19 @@ from langchain.document_loaders.base import BaseLoader
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ContentFormat(str, Enum):
    """Page-body representations that can be requested from Confluence.

    Each member's value is the string passed as the ``expand`` argument
    when fetching pages (e.g. ``"body.storage"``). ``STORAGE`` selects the
    raw XML storage representation; ``VIEW`` selects the rendered HTML
    representation.
    """

    STORAGE = "body.storage"
    VIEW = "body.view"

    def get_content(self, page: dict) -> str:
        """Return the body text of ``page`` for this content format.

        :param page: page dict fetched with ``expand`` set to this
            member's value, so that ``page["body"][<format>]["value"]``
            is present
        :type page: dict
        :return: the string stored under ``page["body"][<format>]["value"]``
        :rtype: str
        :raises KeyError: if ``page`` does not contain the expanded body
            for this format
        """
        # The member value is already the dotted path to the expanded body
        # ("body.storage" -> page["body"]["storage"]), so walk it instead of
        # keeping a separate branch per member that must be updated in
        # lock-step with the enum.
        node = page
        for key in self.value.split("."):
            node = node[key]
        return node["value"]
|
||||||
|
|
||||||
|
|
||||||
class ConfluenceLoader(BaseLoader):
|
class ConfluenceLoader(BaseLoader):
|
||||||
"""
|
"""
|
||||||
Load Confluence pages. Port of https://llamahub.ai/l/confluence
|
Load Confluence pages. Port of https://llamahub.ai/l/confluence
|
||||||
@ -31,6 +45,12 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG,
|
Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG,
|
||||||
SVG, Word and Excel.
|
SVG, Word and Excel.
|
||||||
|
|
||||||
|
Confluence API supports different formats of page content. The storage format is the
|
||||||
|
raw XML representation for storage. The view format is the HTML representation for
|
||||||
|
viewing, with macros rendered as though it is viewed by users. You can pass
|
||||||
|
an enum `content_format` argument to `load()` to specify the content format; this is
|
||||||
|
set to `ContentFormat.STORAGE` by default.
|
||||||
|
|
||||||
Hint: space_key and page_id can both be found in the URL of a page in Confluence
|
Hint: space_key and page_id can both be found in the URL of a page in Confluence
|
||||||
- https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
|
- https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
|
||||||
|
|
||||||
@ -178,6 +198,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_archived_content: bool = False,
|
include_archived_content: bool = False,
|
||||||
include_attachments: bool = False,
|
include_attachments: bool = False,
|
||||||
include_comments: bool = False,
|
include_comments: bool = False,
|
||||||
|
content_format: ContentFormat = ContentFormat.STORAGE,
|
||||||
limit: Optional[int] = 50,
|
limit: Optional[int] = 50,
|
||||||
max_pages: Optional[int] = 1000,
|
max_pages: Optional[int] = 1000,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
@ -200,6 +221,8 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
:type include_attachments: bool, optional
|
:type include_attachments: bool, optional
|
||||||
:param include_comments: defaults to False
|
:param include_comments: defaults to False
|
||||||
:type include_comments: bool, optional
|
:type include_comments: bool, optional
|
||||||
|
:param content_format: Specify content format, defaults to ContentFormat.STORAGE
|
||||||
|
:type content_format: ContentFormat
|
||||||
:param limit: Maximum number of pages to retrieve per request, defaults to 50
|
:param limit: Maximum number of pages to retrieve per request, defaults to 50
|
||||||
:type limit: int, optional
|
:type limit: int, optional
|
||||||
:param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
|
:param max_pages: Maximum number of pages to retrieve in total, defaults to 1000
|
||||||
@ -228,13 +251,14 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
limit=limit,
|
limit=limit,
|
||||||
max_pages=max_pages,
|
max_pages=max_pages,
|
||||||
status="any" if include_archived_content else "current",
|
status="any" if include_archived_content else "current",
|
||||||
expand="body.storage.value",
|
expand=content_format.value,
|
||||||
)
|
)
|
||||||
docs += self.process_pages(
|
docs += self.process_pages(
|
||||||
pages,
|
pages,
|
||||||
include_restricted_content,
|
include_restricted_content,
|
||||||
include_attachments,
|
include_attachments,
|
||||||
include_comments,
|
include_comments,
|
||||||
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -258,13 +282,14 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
limit=limit,
|
limit=limit,
|
||||||
max_pages=max_pages,
|
max_pages=max_pages,
|
||||||
include_archived_spaces=include_archived_content,
|
include_archived_spaces=include_archived_content,
|
||||||
expand="body.storage.value",
|
expand=content_format.value,
|
||||||
)
|
)
|
||||||
docs += self.process_pages(
|
docs += self.process_pages(
|
||||||
pages,
|
pages,
|
||||||
include_restricted_content,
|
include_restricted_content,
|
||||||
include_attachments,
|
include_attachments,
|
||||||
include_comments,
|
include_comments,
|
||||||
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -282,11 +307,15 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
),
|
),
|
||||||
before_sleep=before_sleep_log(logger, logging.WARNING),
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
||||||
)(self.confluence.get_page_by_id)
|
)(self.confluence.get_page_by_id)
|
||||||
page = get_page(page_id=page_id, expand="body.storage.value")
|
page = get_page(page_id=page_id, expand=content_format.value)
|
||||||
if not include_restricted_content and not self.is_public_page(page):
|
if not include_restricted_content and not self.is_public_page(page):
|
||||||
continue
|
continue
|
||||||
doc = self.process_page(
|
doc = self.process_page(
|
||||||
page, include_attachments, include_comments, ocr_languages
|
page,
|
||||||
|
include_attachments,
|
||||||
|
include_comments,
|
||||||
|
content_format,
|
||||||
|
ocr_languages,
|
||||||
)
|
)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
@ -363,6 +392,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_restricted_content: bool,
|
include_restricted_content: bool,
|
||||||
include_attachments: bool,
|
include_attachments: bool,
|
||||||
include_comments: bool,
|
include_comments: bool,
|
||||||
|
content_format: ContentFormat,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""Process a list of pages into a list of documents."""
|
"""Process a list of pages into a list of documents."""
|
||||||
@ -371,7 +401,11 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
if not include_restricted_content and not self.is_public_page(page):
|
if not include_restricted_content and not self.is_public_page(page):
|
||||||
continue
|
continue
|
||||||
doc = self.process_page(
|
doc = self.process_page(
|
||||||
page, include_attachments, include_comments, ocr_languages
|
page,
|
||||||
|
include_attachments,
|
||||||
|
include_comments,
|
||||||
|
content_format,
|
||||||
|
ocr_languages,
|
||||||
)
|
)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
@ -382,6 +416,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
page: dict,
|
page: dict,
|
||||||
include_attachments: bool,
|
include_attachments: bool,
|
||||||
include_comments: bool,
|
include_comments: bool,
|
||||||
|
content_format: ContentFormat,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
) -> Document:
|
) -> Document:
|
||||||
try:
|
try:
|
||||||
@ -396,9 +431,11 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
attachment_texts = self.process_attachment(page["id"], ocr_languages)
|
attachment_texts = self.process_attachment(page["id"], ocr_languages)
|
||||||
else:
|
else:
|
||||||
attachment_texts = []
|
attachment_texts = []
|
||||||
text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text(
|
|
||||||
" ", strip=True
|
content = content_format.get_content(page)
|
||||||
) + "".join(attachment_texts)
|
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
|
||||||
|
attachment_texts
|
||||||
|
)
|
||||||
if include_comments:
|
if include_comments:
|
||||||
comments = self.confluence.get_page_comments(
|
comments = self.confluence.get_page_comments(
|
||||||
page["id"], expand="body.view.value", depth="all"
|
page["id"], expand="body.view.value", depth="all"
|
||||||
|
Loading…
Reference in New Issue
Block a user