From 3b6206af49a32d947a75965a5167c8726e1d5639 Mon Sep 17 00:00:00 2001
From: Li Yuanzheng
Date: Mon, 15 May 2023 11:09:27 +0800
Subject: [PATCH] Respect User-Specified User-Agent in WebBaseLoader (#4579)

# Respect User-Specified User-Agent in WebBaseLoader

This pull request modifies the `WebBaseLoader` class initializer from the `langchain.document_loaders.web_base` module to preserve any User-Agent specified by the user in the `header_template` parameter.

Previously, even if a User-Agent was specified in `header_template`, it would always be overridden by a random User-Agent generated by the `fake_useragent` library. With this change, if a User-Agent is specified in `header_template`, it will be used. Only in the case where no User-Agent is specified will a random User-Agent be generated and used.

This provides additional flexibility when using the `WebBaseLoader` class, allowing users to specify their own User-Agent if they have a specific need or preference, while still providing a reasonable default for cases where no User-Agent is specified.

This change has no impact on existing users who do not specify a User-Agent, as the behavior in this case remains the same. However, for users who do specify a User-Agent, their choice will now be respected and used for all subsequent requests made using the `WebBaseLoader` class.

Fixes #4167

## Before submitting

============================= test session starts ==============================
collecting ... collected 1 item

test_web_base.py::TestWebBaseLoader::test_respect_user_specified_user_agent

============================== 1 passed in 3.64s ===============================
PASSED [100%]

## Who can review?

Community members can review the PR once tests pass.
Tag maintainers/contributors who might be interested: @eyurtsev

---------

Co-authored-by: Eugene Yurtsev
---
 langchain/document_loaders/web_base.py | 25 ++++++++++---------
 .../document_loader/test_web_base.py   | 10 ++++++++
 2 files changed, 24 insertions(+), 11 deletions(-)
 create mode 100644 tests/unit_tests/document_loader/test_web_base.py

diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
index f39f361fa2..4c7c6cc023 100644
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -68,17 +68,20 @@ class WebBaseLoader(BaseLoader):
                 "bs4 package not found, please install it with " "`pip install bs4`"
             )
 
-        try:
-            from fake_useragent import UserAgent
-
-            headers = header_template or default_header_template
-            headers["User-Agent"] = UserAgent().random
-            self.session.headers = dict(headers)
-        except ImportError:
-            logger.info(
-                "fake_useragent not found, using default user agent. "
-                "To get a realistic header for requests, `pip install fake_useragent`."
-            )
+        # Copy so neither the caller's dict nor the shared default is mutated.
+        headers = dict(header_template or default_header_template)
+        if not headers.get("User-Agent"):
+            try:
+                from fake_useragent import UserAgent
+
+                headers["User-Agent"] = UserAgent().random
+            except ImportError:
+                logger.info(
+                    "fake_useragent not found, using default user agent. "
+                    "To get a realistic header for requests, "
+                    "`pip install fake_useragent`."
+                )
+        self.session.headers = headers
 
     @property
     def web_path(self) -> str:
diff --git a/tests/unit_tests/document_loader/test_web_base.py b/tests/unit_tests/document_loader/test_web_base.py
new file mode 100644
index 0000000000..fe6839a529
--- /dev/null
+++ b/tests/unit_tests/document_loader/test_web_base.py
@@ -0,0 +1,10 @@
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class TestWebBaseLoader:
+    def test_respect_user_specified_user_agent(self) -> None:
+        user_specified_user_agent = "user_specified_user_agent"
+        header_template = {"User-Agent": user_specified_user_agent}
+        url = "https://www.example.com"
+        loader = WebBaseLoader(url, header_template=header_template)
+        assert loader.session.headers["User-Agent"] == user_specified_user_agent