From fa4c35e9e548f77d41f2da9ed0c2c20ca6277c57 Mon Sep 17 00:00:00 2001
From: Kohei Kumazaki <kumazaki98+github@gmail.com>
Date: Sat, 29 Apr 2023 12:56:33 +0900
Subject: [PATCH] Fix encoding issue in WebBaseLoader (#3602)

Character encoding mismatches occurred when the response header did not
include charset information (in my case, on a Japanese web page).
I solved this issue by setting the response's encoding to its
apparent_encoding before reading the response text.
---
 langchain/document_loaders/web_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
index 50cf549d..1d4e90c5 100644
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -169,6 +169,7 @@ class WebBaseLoader(BaseLoader):
         self._check_parser(parser)
 
         html_doc = self.session.get(url)
+        html_doc.encoding = html_doc.apparent_encoding
         return BeautifulSoup(html_doc.text, parser)
 
     def scrape(self, parser: Union[str, None] = None) -> Any: