community[patch]: Force opt-in for WebResearchRetriever (CVE-2024-3095) (#24451)

This PR addresses the issue raised by CVE-2024-3095 (https://huntr.com/bounties/e62d4895-2901-405b-9559-38276b6a5273). Unfortunately, we didn't do a good job writing the initial report: it points at both the wrong package and the wrong code. The affected code is the WebResearchRetriever, not the AsyncHTMLLoader, and the WebResearchRetriever lives in langchain-community. The vulnerable code lives here: 0bd3f4e129/libs/community/langchain_community/retrievers/web_research.py (L233-L233). This PR adds a forced opt-in so that users are made aware of the risk and can mitigate it by configuring a proxy: 0bd3f4e129/libs/community/langchain_community/retrievers/web_research.py (L84-L84).
2024-11-10 01:10:59 +00:00 · 2024-07-19 14:51:35 -04:00 · 2024-07-19 14:51:35 -04:00 · 604dfe2d99
commit 604dfe2d99
parent f101c759ed
1 changed files with 30 additions and 1 deletions
--- a/libs/community/langchain_community/retrievers/web_research.py
+++ b/libs/community/langchain_community/retrievers/web_research.py
@ -1,6 +1,6 @@
 import logging
 import re
-from typing import List, Optional
+from typing import Any, List, Optional

 from langchain.chains import LLMChain
 from langchain.chains.prompt_selector import ConditionalPromptSelector
@ -81,6 +81,35 @@ class WebResearchRetriever(BaseRetriever):
        "check .netrc for proxy configuration",
    )

+    allow_dangerous_requests: bool = False
+    """A flag to force users to acknowledge the risks of SSRF attacks when using 
+    this retriever.
+    
+    Users should set this flag to `True` if they have taken the necessary precautions
+    to prevent SSRF attacks when using this retriever.
+    
+    For example, users can run the requests through a properly configured
+    proxy and prevent the crawler from accidentally crawling internal resources.
+    """
+
+    def __init__(self, **kwargs: Any) -> None:
+        """Initialize the retriever; raises ValueError without explicit opt-in."""
+        # Fail closed: refuse to construct unless the caller explicitly opted in.
+        allow_dangerous_requests = kwargs.get("allow_dangerous_requests", False)
+        if not allow_dangerous_requests:
+            raise ValueError(
+                "WebResearchRetriever crawls URLs surfaced through "
+                "the provided search engine. It is possible that some of those URLs "
+                "will end up pointing to machines residing on an internal network, "
+                "leading to an SSRF (Server-Side Request Forgery) attack. "
+                "To protect yourself against that risk, you can run the requests "
+                "through a proxy and prevent the crawler from accidentally crawling "
+                "internal resources. "
+                "If you've taken the necessary precautions, you can set "
+                "`allow_dangerous_requests` to `True`."
+            )
+        super().__init__(**kwargs)
+
    @classmethod
    def from_llm(
        cls,