mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
feat(confluence): add markdown format option (#8246)
# Description: **Add the possibility to keep text as Markdown in the ConfluenceLoader** Add a bool parameter, `keep_markdown_format`, that makes it possible to keep the Markdown format of the Confluence pages. This is useful because it allows MarkdownHeaderTextSplitter to be used as the text splitter. If this parameter is set to True in the load() method, the pages are converted to Markdown using the markdownify library. # Issue: [4407](https://github.com/langchain-ai/langchain/issues/4407) # Dependencies: Add the markdownify library # Tag maintainer: @rlancemartin, @eyurtsev # Twitter handle: FloBastinHeyI - https://twitter.com/FloBastinHeyI --------- Co-authored-by: Florian Bastin <florian.bastin@octo.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
ee6ff96e28
commit
a3ac9b23eb
@ -205,6 +205,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
limit: Optional[int] = 50,
|
limit: Optional[int] = 50,
|
||||||
max_pages: Optional[int] = 1000,
|
max_pages: Optional[int] = 1000,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
|
keep_markdown_format: bool = False,
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""
|
"""
|
||||||
:param space_key: Space key retrieved from a confluence URL, defaults to None
|
:param space_key: Space key retrieved from a confluence URL, defaults to None
|
||||||
@ -234,6 +235,9 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
language, you'll first need to install the appropriate
|
language, you'll first need to install the appropriate
|
||||||
Tesseract language pack.
|
Tesseract language pack.
|
||||||
:type ocr_languages: str, optional
|
:type ocr_languages: str, optional
|
||||||
|
:param keep_markdown_format: Whether to keep the markdown format, defaults to
|
||||||
|
False
|
||||||
|
:type keep_markdown_format: bool
|
||||||
:raises ValueError: _description_
|
:raises ValueError: _description_
|
||||||
:raises ImportError: _description_
|
:raises ImportError: _description_
|
||||||
:return: _description_
|
:return: _description_
|
||||||
@ -263,6 +267,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_comments,
|
include_comments,
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
|
keep_markdown_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
if label:
|
if label:
|
||||||
@ -294,6 +299,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_comments,
|
include_comments,
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
|
keep_markdown_format,
|
||||||
)
|
)
|
||||||
|
|
||||||
if page_ids:
|
if page_ids:
|
||||||
@ -319,6 +325,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_comments,
|
include_comments,
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
|
keep_markdown_format,
|
||||||
)
|
)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
@ -397,6 +404,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_comments: bool,
|
include_comments: bool,
|
||||||
content_format: ContentFormat,
|
content_format: ContentFormat,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
|
keep_markdown_format: Optional[bool] = False,
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""Process a list of pages into a list of documents."""
|
"""Process a list of pages into a list of documents."""
|
||||||
docs = []
|
docs = []
|
||||||
@ -409,6 +417,7 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_comments,
|
include_comments,
|
||||||
content_format,
|
content_format,
|
||||||
ocr_languages,
|
ocr_languages,
|
||||||
|
keep_markdown_format,
|
||||||
)
|
)
|
||||||
docs.append(doc)
|
docs.append(doc)
|
||||||
|
|
||||||
@ -421,24 +430,42 @@ class ConfluenceLoader(BaseLoader):
|
|||||||
include_comments: bool,
|
include_comments: bool,
|
||||||
content_format: ContentFormat,
|
content_format: ContentFormat,
|
||||||
ocr_languages: Optional[str] = None,
|
ocr_languages: Optional[str] = None,
|
||||||
|
keep_markdown_format: Optional[bool] = False,
|
||||||
) -> Document:
|
) -> Document:
|
||||||
try:
|
if keep_markdown_format:
|
||||||
from bs4 import BeautifulSoup # type: ignore
|
try:
|
||||||
except ImportError:
|
from markdownify import markdownify
|
||||||
raise ImportError(
|
except ImportError:
|
||||||
"`beautifulsoup4` package not found, please run "
|
raise ImportError(
|
||||||
"`pip install beautifulsoup4`"
|
"`markdownify` package not found, please run "
|
||||||
)
|
"`pip install markdownify`"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
from bs4 import BeautifulSoup # type: ignore
|
||||||
|
except ImportError:
|
||||||
|
raise ImportError(
|
||||||
|
"`beautifulsoup4` package not found, please run "
|
||||||
|
"`pip install beautifulsoup4`"
|
||||||
|
)
|
||||||
|
|
||||||
if include_attachments:
|
if include_attachments:
|
||||||
attachment_texts = self.process_attachment(page["id"], ocr_languages)
|
attachment_texts = self.process_attachment(page["id"], ocr_languages)
|
||||||
else:
|
else:
|
||||||
attachment_texts = []
|
attachment_texts = []
|
||||||
|
|
||||||
content = content_format.get_content(page)
|
if keep_markdown_format:
|
||||||
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
|
# Use markdownify to keep the page Markdown style
|
||||||
attachment_texts
|
text = markdownify(
|
||||||
)
|
page["body"]["storage"]["value"], heading_style="ATX"
|
||||||
|
) + "".join(attachment_texts)
|
||||||
|
|
||||||
|
else:
|
||||||
|
content = content_format.get_content(page)
|
||||||
|
text = BeautifulSoup(content, "lxml").get_text(" ", strip=True) + "".join(
|
||||||
|
attachment_texts
|
||||||
|
)
|
||||||
|
|
||||||
if include_comments:
|
if include_comments:
|
||||||
comments = self.confluence.get_page_comments(
|
comments = self.confluence.get_page_comments(
|
||||||
page["id"], expand="body.view.value", depth="all"
|
page["id"], expand="body.view.value", depth="all"
|
||||||
|
Loading…
Reference in New Issue
Block a user