From 54b1645d138342048ff7a8185064dfeed8244ca9 Mon Sep 17 00:00:00 2001
From: blob42 <contact@blob42.xyz>
Date: Sun, 9 Apr 2023 19:51:56 +0000
Subject: [PATCH] fix: ReadTheDocs loader main content filter (#2609)

It seems the main content wrapper element on ReadTheDocs sites has
changed, or for some reason it is different in the pages I am loading.

This adds a fallback filter for the main content wrapper: if no
`<main id="main-content">` element is found, the loader now looks for
`<div role="main">` instead.


![2023-04-09-043315_1178x873_scrot](https://user-images.githubusercontent.com/210457/230751369-24b69cb9-1601-4540-b5f3-d115165f55f6.jpg)

Co-authored-by: blob42 <spike@w530>
---
 langchain/document_loaders/readthedocs.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/langchain/document_loaders/readthedocs.py b/langchain/document_loaders/readthedocs.py
index a8ca2433..b35f2687 100644
--- a/langchain/document_loaders/readthedocs.py
+++ b/langchain/document_loaders/readthedocs.py
@@ -45,6 +45,10 @@ class ReadTheDocsLoader(BaseLoader):
         def _clean_data(data: str) -> str:
             soup = BeautifulSoup(data, **self.bs_kwargs)
             text = soup.find_all("main", {"id": "main-content"})
+
+            if len(text) == 0:
+                text = soup.find_all("div", {"role": "main"})
+
             if len(text) != 0:
                 text = text[0].get_text()
             else: