From aa345a4bb7320cace53a053692ad8be95902a83d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20Bry=C5=84ski?= Date: Thu, 27 Apr 2023 01:10:16 +0200 Subject: [PATCH] Add get_text_separator parameter to BSHTMLLoader (#3551) By default, get_text doesn't separate the content of different HTML tags. Adding an option for specifying the separator helps with document splitting. --- langchain/document_loaders/html_bs.py | 4 +++- tests/integration_tests/document_loaders/test_bshtml.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py index fc636367..4a73187a 100644 --- a/langchain/document_loaders/html_bs.py +++ b/langchain/document_loaders/html_bs.py @@ -17,6 +17,7 @@ class BSHTMLLoader(BaseLoader): file_path: str, open_encoding: Union[str, None] = None, bs_kwargs: Union[dict, None] = None, + get_text_separator: str = "", ) -> None: """Initialise with path, and optionally, file encoding to use, and any kwargs to pass to the BeautifulSoup object.""" @@ -33,6 +34,7 @@ class BSHTMLLoader(BaseLoader): if bs_kwargs is None: bs_kwargs = {"features": "lxml"} self.bs_kwargs = bs_kwargs + self.get_text_separator = get_text_separator def load(self) -> List[Document]: from bs4 import BeautifulSoup @@ -41,7 +43,7 @@ class BSHTMLLoader(BaseLoader): with open(self.file_path, "r", encoding=self.open_encoding) as f: soup = BeautifulSoup(f, **self.bs_kwargs) - text = soup.get_text() + text = soup.get_text(self.get_text_separator) if soup.title: title = str(soup.title.string) diff --git a/tests/integration_tests/document_loaders/test_bshtml.py b/tests/integration_tests/document_loaders/test_bshtml.py index 2510788e..038371fa 100644 --- a/tests/integration_tests/document_loaders/test_bshtml.py +++ b/tests/integration_tests/document_loaders/test_bshtml.py @@ -9,15 +9,17 @@ from langchain.document_loaders.html_bs import BSHTMLLoader def test_bs_html_loader() -> None: """Test unstructured loader.""" file_path 
= Path(__file__).parent.parent / "examples/example.html" - loader = BSHTMLLoader(str(file_path)) + loader = BSHTMLLoader(str(file_path), get_text_separator="|") docs = loader.load() assert len(docs) == 1 metadata = docs[0].metadata + content = docs[0].page_content assert metadata["title"] == "Chew dad's slippers" assert metadata["source"] == str(file_path) + assert content[:2] == "\n|" @pytest.mark.skipif(