From 2c114fcb5ecc0a9e75e8acb63d9dd5b4a6ced9a9 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 28 Sep 2023 22:36:46 -0400 Subject: [PATCH] Fix web-base loader (#11135) Fix initialization https://github.com/langchain-ai/langchain/issues/11095 --- libs/langchain/langchain/document_loaders/web_base.py | 7 ++++++- .../tests/unit_tests/document_loaders/test_web_base.py | 8 ++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/document_loaders/web_base.py b/libs/langchain/langchain/document_loaders/web_base.py index 3c4677267a..5d1a9daa48 100644 --- a/libs/langchain/langchain/document_loaders/web_base.py +++ b/libs/langchain/langchain/document_loaders/web_base.py @@ -76,10 +76,15 @@ class WebBaseLoader(BaseLoader): ) if web_paths: self.web_paths = list(web_paths) + elif isinstance(web_path, str): + self.web_paths = [web_path] elif isinstance(web_path, Sequence): self.web_paths = list(web_path) else: - self.web_paths = [web_path] + raise TypeError( + f"web_path must be str or Sequence[str] got ({type(web_path)}) or" + f" web_paths must be Sequence[str] got ({type(web_paths)})" + ) self.requests_per_second = requests_per_second self.default_parser = default_parser self.requests_kwargs = requests_kwargs or {} diff --git a/libs/langchain/tests/unit_tests/document_loaders/test_web_base.py b/libs/langchain/tests/unit_tests/document_loaders/test_web_base.py index ecbf423dbf..41a81a15a5 100644 --- a/libs/langchain/tests/unit_tests/document_loaders/test_web_base.py +++ b/libs/langchain/tests/unit_tests/document_loaders/test_web_base.py @@ -11,3 +11,11 @@ class TestWebBaseLoader: url = "https://www.example.com" loader = WebBaseLoader(url, header_template=header_template) assert loader.session.headers["User-Agent"] == user_specified_user_agent + + def test_web_path_parameter(self) -> None: + web_base_loader = WebBaseLoader(web_paths=["https://www.example.com"]) + assert web_base_loader.web_paths == ["https://www.example.com"] + web_base_loader = WebBaseLoader(web_path=["https://www.example.com"]) + assert web_base_loader.web_paths == ["https://www.example.com"] + web_base_loader = WebBaseLoader(web_path="https://www.example.com") + assert web_base_loader.web_paths == ["https://www.example.com"]