From 62603f2664516f19b414bdcc31dbad2ceb758303 Mon Sep 17 00:00:00 2001 From: Cory Zue Date: Tue, 19 Sep 2023 21:59:52 +0200 Subject: [PATCH] make auto-setting the encoding optional, allow explicitly setting it (#10774) I was trying to use web loaders on some Spanish documentation (e.g. [this site](https://www.fromdoppler.com/es/mailing-tendencias/)), but the auto-encoding introduced in https://github.com/langchain-ai/langchain/pull/3602 detected the encoding as "MacRoman" instead of the (correct) "UTF-8". To address this, I've added the ability to disable the auto-encoding, as well as the ability to explicitly tell the loader what encoding to use. - **Description:** Makes auto-setting the encoding optional in `WebBaseLoader`, and introduces an `encoding` option to explicitly set it. - **Dependencies:** N/A - **Tag maintainer:** @hwchase17 - **Twitter handle:** @czue --- libs/langchain/langchain/document_loaders/web_base.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/web_base.py b/libs/langchain/langchain/document_loaders/web_base.py index e6ee1db045..b31a49ecc8 100644 --- a/libs/langchain/langchain/document_loaders/web_base.py +++ b/libs/langchain/langchain/document_loaders/web_base.py @@ -63,6 +63,8 @@ class WebBaseLoader(BaseLoader): verify_ssl: Optional[bool] = True, proxies: Optional[dict] = None, continue_on_failure: Optional[bool] = False, + autoset_encoding: Optional[bool] = True, + encoding: Optional[str] = None, ): """Initialize with webpage path.""" @@ -98,7 +100,8 @@ class WebBaseLoader(BaseLoader): self.session.headers = dict(headers) self.session.verify = verify_ssl self.continue_on_failure = continue_on_failure - + self.autoset_encoding = autoset_encoding + self.encoding = encoding if proxies: self.session.proxies.update(proxies) @@ -208,7 +211,11 @@ class WebBaseLoader(BaseLoader): html_doc = self.session.get(url, **self.requests_kwargs) if self.raise_for_status: 
html_doc.raise_for_status() - html_doc.encoding = html_doc.apparent_encoding + + if self.encoding is not None: + html_doc.encoding = self.encoding + elif self.autoset_encoding: + html_doc.encoding = html_doc.apparent_encoding return BeautifulSoup(html_doc.text, parser) def scrape(self, parser: Union[str, None] = None) -> Any: