feat(confluence): add markdown format option (#8246)

# Description:
**Add the possibility to keep text as Markdown in the ConfluenceLoader**
Add a boolean parameter, `keep_markdown_format`, that preserves the
Markdown format of the Confluence pages.
This is useful because it allows MarkdownHeaderTextSplitter to be used
as a DataSplitter.
If this variable is set to True in the load() method, the pages are
converted to Markdown using the markdownify library.

  # Issue: 
[4407](https://github.com/langchain-ai/langchain/issues/4407)
  # Dependencies: 
Add the markdownify library
  # Tag maintainer:
 @rlancemartin, @eyurtsev
  # Twitter handle:
 FloBastinHeyI - https://twitter.com/FloBastinHeyI

---------

Co-authored-by: Florian Bastin <florian.bastin@octo.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Bastin Florian 2023-07-27 00:00:27 +02:00 committed by GitHub
parent ee6ff96e28
commit a3ac9b23eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -205,6 +205,7 @@ class ConfluenceLoader(BaseLoader):
limit: Optional[int] = 50, limit: Optional[int] = 50,
max_pages: Optional[int] = 1000, max_pages: Optional[int] = 1000,
ocr_languages: Optional[str] = None, ocr_languages: Optional[str] = None,
keep_markdown_format: bool = False,
) -> List[Document]: ) -> List[Document]:
""" """
:param space_key: Space key retrieved from a confluence URL, defaults to None :param space_key: Space key retrieved from a confluence URL, defaults to None
@ -234,6 +235,9 @@ class ConfluenceLoader(BaseLoader):
language, you'll first need to install the appropriate language, you'll first need to install the appropriate
Tesseract language pack. Tesseract language pack.
:type ocr_languages: str, optional :type ocr_languages: str, optional
:param keep_markdown_format: Whether to keep the markdown format, defaults to
False
:type keep_markdown_format: bool
:raises ValueError: _description_ :raises ValueError: _description_
:raises ImportError: _description_ :raises ImportError: _description_
:return: _description_ :return: _description_
@ -263,6 +267,7 @@ class ConfluenceLoader(BaseLoader):
include_comments, include_comments,
content_format, content_format,
ocr_languages, ocr_languages,
keep_markdown_format,
) )
if label: if label:
@ -294,6 +299,7 @@ class ConfluenceLoader(BaseLoader):
include_comments, include_comments,
content_format, content_format,
ocr_languages, ocr_languages,
keep_markdown_format,
) )
if page_ids: if page_ids:
@ -319,6 +325,7 @@ class ConfluenceLoader(BaseLoader):
include_comments, include_comments,
content_format, content_format,
ocr_languages, ocr_languages,
keep_markdown_format,
) )
docs.append(doc) docs.append(doc)
@ -397,6 +404,7 @@ class ConfluenceLoader(BaseLoader):
include_comments: bool, include_comments: bool,
content_format: ContentFormat, content_format: ContentFormat,
ocr_languages: Optional[str] = None, ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False,
) -> List[Document]: ) -> List[Document]:
"""Process a list of pages into a list of documents.""" """Process a list of pages into a list of documents."""
docs = [] docs = []
@ -409,6 +417,7 @@ class ConfluenceLoader(BaseLoader):
include_comments, include_comments,
content_format, content_format,
ocr_languages, ocr_languages,
keep_markdown_format,
) )
docs.append(doc) docs.append(doc)
@ -421,24 +430,42 @@ class ConfluenceLoader(BaseLoader):
include_comments: bool, include_comments: bool,
content_format: ContentFormat, content_format: ContentFormat,
ocr_languages: Optional[str] = None, ocr_languages: Optional[str] = None,
keep_markdown_format: Optional[bool] = False,
) -> Document: ) -> Document:
try: if keep_markdown_format:
from bs4 import BeautifulSoup # type: ignore try:
except ImportError: from markdownify import markdownify
raise ImportError( except ImportError:
"`beautifulsoup4` package not found, please run " raise ImportError(
"`pip install beautifulsoup4`" "`markdownify` package not found, please run "
) "`pip install markdownify`"
)
else:
try:
from bs4 import BeautifulSoup # type: ignore
except ImportError:
raise ImportError(
"`beautifulsoup4` package not found, please run "
"`pip install beautifulsoup4`"
)
if include_attachments: if include_attachments:
attachment_texts = self.process_attachment(page["id"], ocr_languages) attachment_texts = self.process_attachment(page["id"], ocr_languages)
else: else:
attachment_texts = [] attachment_texts = []
content = content_format.get_content(page) if keep_markdown_format:
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join( # Use markdownify to keep the page Markdown style
attachment_texts text = markdownify(
) page["body"]["storage"]["value"], heading_style="ATX"
) + "".join(attachment_texts)
else:
content = content_format.get_content(page)
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
attachment_texts
)
if include_comments: if include_comments:
comments = self.confluence.get_page_comments( comments = self.confluence.get_page_comments(
page["id"], expand="body.view.value", depth="all" page["id"], expand="body.view.value", depth="all"