From 14d0e0ee41e7b60e6835c1684e14916ddd86e0f3 Mon Sep 17 00:00:00 2001 From: blob42 Date: Sun, 9 Apr 2023 04:40:26 +0200 Subject: [PATCH] fix: ReadTheDocs loader main content filter --- langchain/document_loaders/readthedocs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/langchain/document_loaders/readthedocs.py b/langchain/document_loaders/readthedocs.py index a8ca2433..b35f2687 100644 --- a/langchain/document_loaders/readthedocs.py +++ b/langchain/document_loaders/readthedocs.py @@ -45,6 +45,10 @@ class ReadTheDocsLoader(BaseLoader): def _clean_data(data: str) -> str: soup = BeautifulSoup(data, **self.bs_kwargs) text = soup.find_all("main", {"id": "main-content"}) + + if len(text) == 0: + text = soup.find_all("div", {"role": "main"}) + if len(text) != 0: text = text[0].get_text() else: