From 06017ffd21f226c7871b8a8e5212d2de833bdd60 Mon Sep 17 00:00:00 2001
From: Alex Iribarren
Date: Sat, 1 Apr 2023 21:40:21 +0200
Subject: [PATCH] Allow configuring content selector

---
 langchain/document_loaders/gitbook.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/langchain/document_loaders/gitbook.py b/langchain/document_loaders/gitbook.py
index edb60ce1b4..f47c9dc1a2 100644
--- a/langchain/document_loaders/gitbook.py
+++ b/langchain/document_loaders/gitbook.py
@@ -18,6 +18,7 @@ class GitbookLoader(WebBaseLoader):
         web_page: str,
         load_all_paths: bool = False,
         base_url: Optional[str] = None,
+        content_selector: str = "main",
     ):
         """Initialize with web page and whether to load all paths.
 
@@ -39,6 +40,7 @@ class GitbookLoader(WebBaseLoader):
             web_paths = web_page
         super().__init__(web_paths)
         self.load_all_paths = load_all_paths
+        self.content_selector = content_selector
 
     def load(self) -> List[Document]:
         """Fetch text from one single GitBook page."""
@@ -61,7 +63,7 @@ class GitbookLoader(WebBaseLoader):
         self, soup: Any, custom_url: Optional[str] = None
     ) -> Optional[Document]:
         """Fetch content from page and return Document."""
-        page_content_raw = soup.find("main")
+        page_content_raw = soup.find(self.content_selector)
         if not page_content_raw:
             return None
         content = page_content_raw.get_text(separator="\n").strip()