From a3ac9b23eb30daa49eaf1feafda43d88b213a95a Mon Sep 17 00:00:00 2001 From: Bastin Florian <33825469+BastinFlorian@users.noreply.github.com> Date: Thu, 27 Jul 2023 00:00:27 +0200 Subject: [PATCH] feat(confluence): add markdown format option (#8246) # Description: **Add the possibility to keep text as Markdown in the ConfluenceLoader** Add a bool variable that allows keeping the Markdown format of the Confluence pages. It is useful because it allows using MarkdownHeaderTextSplitter as a DataSplitter. If this variable is set to True in the load() method, the pages are extracted using the markdownify library. # Issue: [4407](https://github.com/langchain-ai/langchain/issues/4407) # Dependencies: Add the markdownify library # Tag maintainer: @rlancemartin, @eyurtsev # Twitter handle: FloBastinHeyI - https://twitter.com/FloBastinHeyI --------- Co-authored-by: Florian Bastin Co-authored-by: Bagatur --- .../langchain/document_loaders/confluence.py | 49 ++++++++++++++----- 1 file changed, 38 insertions(+), 11 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index 9619071332..8615faa7d8 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -205,6 +205,7 @@ class ConfluenceLoader(BaseLoader): limit: Optional[int] = 50, max_pages: Optional[int] = 1000, ocr_languages: Optional[str] = None, + keep_markdown_format: bool = False, ) -> List[Document]: """ :param space_key: Space key retrieved from a confluence URL, defaults to None @@ -234,6 +235,9 @@ class ConfluenceLoader(BaseLoader): language, you'll first need to install the appropriate Tesseract language pack. 
:type ocr_languages: str, optional + :param keep_markdown_format: Whether to keep the markdown format, defaults to + False + :type keep_markdown_format: bool :raises ValueError: _description_ :raises ImportError: _description_ :return: _description_ @@ -263,6 +267,7 @@ class ConfluenceLoader(BaseLoader): include_comments, content_format, ocr_languages, + keep_markdown_format, ) if label: @@ -294,6 +299,7 @@ class ConfluenceLoader(BaseLoader): include_comments, content_format, ocr_languages, + keep_markdown_format, ) if page_ids: @@ -319,6 +325,7 @@ class ConfluenceLoader(BaseLoader): include_comments, content_format, ocr_languages, + keep_markdown_format, ) docs.append(doc) @@ -397,6 +404,7 @@ class ConfluenceLoader(BaseLoader): include_comments: bool, content_format: ContentFormat, ocr_languages: Optional[str] = None, + keep_markdown_format: Optional[bool] = False, ) -> List[Document]: """Process a list of pages into a list of documents.""" docs = [] @@ -409,6 +417,7 @@ class ConfluenceLoader(BaseLoader): include_comments, content_format, ocr_languages, + keep_markdown_format, ) docs.append(doc) @@ -421,24 +430,42 @@ class ConfluenceLoader(BaseLoader): include_comments: bool, content_format: ContentFormat, ocr_languages: Optional[str] = None, + keep_markdown_format: Optional[bool] = False, ) -> Document: - try: - from bs4 import BeautifulSoup # type: ignore - except ImportError: - raise ImportError( - "`beautifulsoup4` package not found, please run " - "`pip install beautifulsoup4`" - ) + if keep_markdown_format: + try: + from markdownify import markdownify + except ImportError: + raise ImportError( + "`markdownify` package not found, please run " + "`pip install markdownify`" + ) + else: + try: + from bs4 import BeautifulSoup # type: ignore + except ImportError: + raise ImportError( + "`beautifulsoup4` package not found, please run " + "`pip install beautifulsoup4`" + ) if include_attachments: attachment_texts = self.process_attachment(page["id"], ocr_languages) 
else: attachment_texts = [] - content = content_format.get_content(page) - text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join( - attachment_texts - ) + if keep_markdown_format: + # Use markdownify to keep the page Markdown style + text = markdownify( + page["body"]["storage"]["value"], heading_style="ATX" + ) + "".join(attachment_texts) + + else: + content = content_format.get_content(page) + text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join( + attachment_texts + ) + if include_comments: comments = self.confluence.get_page_comments( page["id"], expand="body.view.value", depth="all"