community[minor]: allow enabling proxy in aiohttp session in AsyncHTML (#19499)

Allow enabling a proxy for the aiohttp session used by AsyncHtmlLoader by exposing a trust_env flag, and plumb it through WebResearchRetriever.
Sihan Chen 4 weeks ago committed by GitHub
parent 36813d2f00
commit 1f81277b9b
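
For reference, a minimal usage sketch of the new flag (not part of the diff, and assuming the langchain_community import path): with trust_env=True the underlying aiohttp session honors the http_proxy/https_proxy environment variables (or .netrc) instead of ignoring them.

from langchain_community.document_loaders import AsyncHtmlLoader

urls = ["https://www.espn.com", "https://lilianweng.github.io/posts/2023-06-23-agent/"]

# trust_env=True makes the aiohttp ClientSession pick up http_proxy/https_proxy
# (or .netrc); with the default trust_env=False, proxy settings are ignored.
loader = AsyncHtmlLoader(urls, trust_env=True)
docs = loader.load()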

@ -37,6 +37,10 @@
"source": [
"urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
"loader = AsyncHtmlLoader(urls)\n",
"# If you need to use the proxy to make web requests, for example using http_proxy/https_proxy environmental variables,\n",
"# please set trust_env=True explicitly here as follows:\n",
"# loader = AsyncHtmlLoader(urls, trust_env=True)\n",
"# Otherwise, loader.load() may stuck becuase aiohttp session does not recognize the proxy by default\n",
"docs = loader.load()"
]
},

@ -64,6 +64,7 @@ class AsyncHtmlLoader(BaseLoader):
ignore_load_errors: bool = False,
*,
preserve_order: bool = True,
trust_env: bool = False,
):
"""Initialize with a webpage path."""
@ -104,6 +105,8 @@ class AsyncHtmlLoader(BaseLoader):
self.ignore_load_errors = ignore_load_errors
self.preserve_order = preserve_order
self.trust_env = trust_env
def _fetch_valid_connection_docs(self, url: str) -> Any:
if self.ignore_load_errors:
try:
@ -126,7 +129,7 @@ class AsyncHtmlLoader(BaseLoader):
async def _fetch(
self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
) -> str:
async with aiohttp.ClientSession() as session:
async with aiohttp.ClientSession(trust_env=self.trust_env) as session:
for i in range(retries):
try:
async with session.get(

@ -75,6 +75,11 @@ class WebResearchRetriever(BaseRetriever):
url_database: List[str] = Field(
default_factory=list, description="List of processed URLs"
)
trust_env: bool = Field(
False,
description="Whether to use the http_proxy/https_proxy env variables or "
"check .netrc for proxy configuration",
)
@classmethod
def from_llm(
@ -87,6 +92,7 @@ class WebResearchRetriever(BaseRetriever):
text_splitter: RecursiveCharacterTextSplitter = RecursiveCharacterTextSplitter(
chunk_size=1500, chunk_overlap=150
),
trust_env: bool = False,
) -> "WebResearchRetriever":
"""Initialize from llm using default template.
@ -97,6 +103,8 @@ class WebResearchRetriever(BaseRetriever):
prompt: prompt for generating search questions
num_search_results: Number of pages per Google search
text_splitter: Text splitter for splitting web pages into chunks
trust_env: Whether to use the http_proxy/https_proxy env variables
or check .netrc for proxy configuration
Returns:
WebResearchRetriever
@ -124,6 +132,7 @@ class WebResearchRetriever(BaseRetriever):
search=search,
num_search_results=num_search_results,
text_splitter=text_splitter,
trust_env=trust_env,
)
def clean_search_query(self, query: str) -> str:
@ -191,7 +200,9 @@ class WebResearchRetriever(BaseRetriever):
logger.info(f"New URLs to load: {new_urls}")
# Load, split, and add new urls to vectorstore
if new_urls:
loader = AsyncHtmlLoader(new_urls, ignore_load_errors=True)
loader = AsyncHtmlLoader(
new_urls, ignore_load_errors=True, trust_env=self.trust_env
)
html2text = Html2TextTransformer()
logger.info("Indexing new urls...")
docs = loader.load()

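The same flag is forwarded by WebResearchRetriever to AsyncHtmlLoader when newly discovered URLs are loaded. A minimal sketch of how it might be wired up, assuming the langchain.retrievers.web_research import path and pre-built llm / search / vectorstore objects (none of which are defined in this change):

from langchain.retrievers.web_research import WebResearchRetriever

# llm, search, and vectorstore are assumed to already exist (e.g. a chat model,
# a GoogleSearchAPIWrapper, and a vector store); they are placeholders here.
retriever = WebResearchRetriever.from_llm(
    vectorstore=vectorstore,
    llm=llm,
    search=search,
    trust_env=True,  # forwarded to AsyncHtmlLoader when fetching new URLs
)
docs = retriever.get_relevant_documents("What is an LLM agent?")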