From b81f98b8a66999117246fbc134fc09d64a04e230 Mon Sep 17 00:00:00 2001
From: Gardner Bickford
Date: Sun, 4 Jun 2023 09:57:25 +1200
Subject: [PATCH] Update confluence.py to return spaces between elements (#5383)

# Update confluence.py to return spaces between elements like headers and links.

Please see https://stackoverflow.com/questions/48913975/how-to-return-nicely-formatted-text-in-beautifulsoup4-when-html-text-is-across-m

Given:

```html
183 Main St
East Copper
Massachusetts
U S A
MA 01516-113
```

The document loader currently returns:

```
'183 Main StEast CopperMassachusettsU S A MA 01516-113'
```

After this change, the document loader will return:

```
183 Main St East Copper Massachusetts U S A MA 01516-113
```

@eyurtsev would you prefer this to be an option that can be passed in?
---
 langchain/document_loaders/confluence.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py
index 2f3dde60..80e1ed12 100644
--- a/langchain/document_loaders/confluence.py
+++ b/langchain/document_loaders/confluence.py
@@ -347,15 +347,17 @@ class ConfluenceLoader(BaseLoader):
             attachment_texts = self.process_attachment(page["id"])
         else:
             attachment_texts = []
-        text = BeautifulSoup(
-            page["body"]["storage"]["value"], "lxml"
-        ).get_text() + "".join(attachment_texts)
+        text = BeautifulSoup(page["body"]["storage"]["value"], "lxml").get_text(
+            " ", strip=True
+        ) + "".join(attachment_texts)
         if include_comments:
             comments = self.confluence.get_page_comments(
                 page["id"], expand="body.view.value", depth="all"
             )["results"]
             comment_texts = [
-                BeautifulSoup(comment["body"]["view"]["value"], "lxml").get_text()
+                BeautifulSoup(comment["body"]["view"]["value"], "lxml").get_text(
+                    " ", strip=True
+                )
                 for comment in comments
             ]
             text = text + "".join(comment_texts)