From d895614d19e3edaa0acd65b100ae355c4659c6dc Mon Sep 17 00:00:00 2001
From: blueoom <4945756+anexplore@users.noreply.github.com>
Date: Tue, 16 Jul 2024 00:43:56 +0800
Subject: [PATCH] =?UTF-8?q?text=5Fsplitters:=20add=20request=20parameters?=
 =?UTF-8?q?=20for=20function=20HTMLHeaderTextSplitter.split=5Ftext?=
 =?UTF-8?q?=E2=80=A6=20(#24178)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Description:**

The `split_text_from_url` method of `HTMLHeaderTextSplitter` calls
`requests.get()` without any way for the caller to supply request
options such as `timeout`. This change adds a `**kwargs` parameter to
the method and forwards it to `requests.get()`, giving callers control
over the underlying GET request.

---------

Co-authored-by: Chester Curme <chester.curme@gmail.com>
---
 libs/text-splitters/langchain_text_splitters/html.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py
index 5cd1fce74d..cdbea7f724 100644
--- a/libs/text-splitters/langchain_text_splitters/html.py
+++ b/libs/text-splitters/langchain_text_splitters/html.py
@@ -71,13 +71,15 @@ class HTMLHeaderTextSplitter:
             for chunk in aggregated_chunks
         ]
 
-    def split_text_from_url(self, url: str) -> List[Document]:
+    def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]:
         """Split HTML from web URL
 
         Args:
             url: web URL
+            **kwargs: Arbitrary additional keyword arguments. These are usually passed
+                to the fetch url content request.
         """
-        r = requests.get(url)
+        r = requests.get(url, **kwargs)
         return self.split_text_from_file(BytesIO(r.content))
 
     def split_text(self, text: str) -> List[Document]: