From d895614d19e3edaa0acd65b100ae355c4659c6dc Mon Sep 17 00:00:00 2001 From: blueoom <4945756+anexplore@users.noreply.github.com> Date: Tue, 16 Jul 2024 00:43:56 +0800 Subject: [PATCH] =?UTF-8?q?text=5Fsplitters:=20add=20request=20parameters?= =?UTF-8?q?=20for=20function=20HTMLHeaderTextSplitter.split=5Ftext?= =?UTF-8?q?=E2=80=A6=20(#24178)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Description:** The `split_text_from_url` method of `HTMLHeaderTextSplitter` does not accept request parameters such as `timeout` when it uses `requests` to fetch the URL. This change adds a `**kwargs` parameter to the method that is forwarded to `requests.get()` internally, giving callers control over the `get` request. --------- Co-authored-by: Chester Curme --- libs/text-splitters/langchain_text_splitters/html.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/text-splitters/langchain_text_splitters/html.py b/libs/text-splitters/langchain_text_splitters/html.py index 5cd1fce74d..cdbea7f724 100644 --- a/libs/text-splitters/langchain_text_splitters/html.py +++ b/libs/text-splitters/langchain_text_splitters/html.py @@ -71,13 +71,15 @@ class HTMLHeaderTextSplitter: for chunk in aggregated_chunks ] - def split_text_from_url(self, url: str) -> List[Document]: + def split_text_from_url(self, url: str, **kwargs: Any) -> List[Document]: """Split HTML from web URL Args: url: web URL + **kwargs: Arbitrary additional keyword arguments. These are usually passed + to the fetch url content request. """ - r = requests.get(url) + r = requests.get(url, **kwargs) return self.split_text_from_file(BytesIO(r.content)) def split_text(self, text: str) -> List[Document]: