From fa4c35e9e548f77d41f2da9ed0c2c20ca6277c57 Mon Sep 17 00:00:00 2001 From: Kohei Kumazaki Date: Sat, 29 Apr 2023 12:56:33 +0900 Subject: [PATCH] Fix encoding issue in WebBaseLoader (#3602) A character encoding mismatch occurred when the response header did not include charset information (in my case, for a Japanese web page). I fixed this by setting the response's encoding to its apparent_encoding before reading the text. --- langchain/document_loaders/web_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py index 50cf549d..1d4e90c5 100644 --- a/langchain/document_loaders/web_base.py +++ b/langchain/document_loaders/web_base.py @@ -169,6 +169,7 @@ class WebBaseLoader(BaseLoader): self._check_parser(parser) html_doc = self.session.get(url) + html_doc.encoding = html_doc.apparent_encoding return BeautifulSoup(html_doc.text, parser) def scrape(self, parser: Union[str, None] = None) -> Any: