diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py index 2f3dde60..80e1ed12 100644 --- a/langchain/document_loaders/confluence.py +++ b/langchain/document_loaders/confluence.py @@ -347,15 +347,17 @@ class ConfluenceLoader(BaseLoader): attachment_texts = self.process_attachment(page["id"]) else: attachment_texts = [] - text = BeautifulSoup( - page["body"]["storage"]["value"], "lxml" - ).get_text() + "".join(attachment_texts) + text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text( + " ", strip=True + ) + "".join(attachment_texts) if include_comments: comments = self.confluence.get_page_comments( page["id"], expand="body.view.value", depth="all" )["results"] comment_texts = [ - BeautifulSoup(comment["body"]["view"]["value"], "lxml").get_text() + BeautifulSoup(comment["body"]["view"]["value"], "lxml").get_text( + " ", strip=True + ) for comment in comments ] text = text + "".join(comment_texts)