diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
index f39f361f..4c7c6cc0 100644
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -68,17 +68,20 @@ class WebBaseLoader(BaseLoader):
                 "bs4 package not found, please install it with " "`pip install bs4`"
             )
 
-        try:
-            from fake_useragent import UserAgent
-
-            headers = header_template or default_header_template
-            headers["User-Agent"] = UserAgent().random
-            self.session.headers = dict(headers)
-        except ImportError:
-            logger.info(
-                "fake_useragent not found, using default user agent. "
-                "To get a realistic header for requests, `pip install fake_useragent`."
-            )
+        # Copy first: never mutate the caller's dict or the shared default template.
+        headers = dict(header_template or default_header_template)
+        if not headers.get("User-Agent"):
+            try:
+                from fake_useragent import UserAgent
+
+                headers["User-Agent"] = UserAgent().random
+            except ImportError:
+                logger.info(
+                    "fake_useragent not found, using default user agent. "
+                    "To get a realistic header for requests, "
+                    "`pip install fake_useragent`."
+                )
+        self.session.headers = dict(headers)
 
     @property
     def web_path(self) -> str:
diff --git a/tests/unit_tests/document_loader/test_web_base.py b/tests/unit_tests/document_loader/test_web_base.py
new file mode 100644
index 00000000..fe6839a5
--- /dev/null
+++ b/tests/unit_tests/document_loader/test_web_base.py
@@ -0,0 +1,10 @@
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class TestWebBaseLoader:
+    def test_respect_user_specified_user_agent(self) -> None:
+        user_specified_user_agent = "user_specified_user_agent"
+        header_template = {"User-Agent": user_specified_user_agent}
+        url = "https://www.example.com"
+        loader = WebBaseLoader(url, header_template=header_template)
+        assert loader.session.headers["User-Agent"] == user_specified_user_agent