Gitbook enhancements (#2279)
The gitbook importer had some issues while trying to ingest a particular site; these commits allow it to work as expected. The last commit (06017ff) opens the door to extending this class to other documentation formats (which will come in a future PR).
This commit is contained in:
parent 58a93f88da
commit aecd1c8ee3
@@ -1,6 +1,6 @@
 """Loader that loads GitBook."""
 from typing import Any, List, Optional
-from urllib.parse import urlparse
+from urllib.parse import urljoin, urlparse

 from langchain.docstore.document import Document
 from langchain.document_loaders.web_base import WebBaseLoader
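Why urljoin: plain string concatenation breaks when the base URL has a trailing slash or the path is absolute. A minimal illustration (the URLs are placeholders):

from urllib.parse import urljoin

# Concatenation duplicates or drops slashes depending on the inputs;
# urljoin normalizes them and resolves absolute paths correctly.
print("https://docs.example.com/" + "/intro")          # https://docs.example.com//intro
print(urljoin("https://docs.example.com/", "/intro"))  # https://docs.example.com/intro
print(urljoin("https://docs.example.com/a/", "b"))     # https://docs.example.com/a/b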
@@ -18,6 +18,7 @@ class GitbookLoader(WebBaseLoader):
         web_page: str,
         load_all_paths: bool = False,
         base_url: Optional[str] = None,
+        content_selector: str = "main",
     ):
         """Initialize with web page and whether to load all paths.

@@ -39,6 +40,7 @@ class GitbookLoader(WebBaseLoader):
             web_paths = web_page
         super().__init__(web_paths)
         self.load_all_paths = load_all_paths
+        self.content_selector = content_selector

     def load(self) -> List[Document]:
         """Fetch text from one single GitBook page."""
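With content_selector wired through __init__, callers can target sites whose main content is not wrapped in a <main> tag. A minimal usage sketch (the site URL and the "article" selector are hypothetical; the parameter defaults to "main", preserving the old behavior):

from langchain.document_loaders import GitbookLoader

# Load every page reachable from the site, extracting text from
# <article> elements instead of the default <main>.
loader = GitbookLoader(
    "https://docs.example.com",
    load_all_paths=True,
    content_selector="article",
)
docs = loader.load()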
@@ -47,18 +49,23 @@ class GitbookLoader(WebBaseLoader):
             relative_paths = self._get_paths(soup_info)
             documents = []
             for path in relative_paths:
-                url = self.base_url + path
+                url = urljoin(self.base_url, path)
                 print(f"Fetching text from {url}")
                 soup_info = self._scrape(url)
                 documents.append(self._get_document(soup_info, url))
-            return documents
+            return [d for d in documents if d]
         else:
             soup_info = self.scrape()
-            return [self._get_document(soup_info, self.web_path)]
+            documents = [self._get_document(soup_info, self.web_path)]
+            return [d for d in documents if d]

-    def _get_document(self, soup: Any, custom_url: Optional[str] = None) -> Document:
+    def _get_document(
+        self, soup: Any, custom_url: Optional[str] = None
+    ) -> Optional[Document]:
         """Fetch content from page and return Document."""
-        page_content_raw = soup.find("main")
+        page_content_raw = soup.find(self.content_selector)
+        if not page_content_raw:
+            return None
         content = page_content_raw.get_text(separator="\n").strip()
         title_if_exists = page_content_raw.find("h1")
         title = title_if_exists.text if title_if_exists else ""
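The new None guard matters because soup.find() returns None when the selector matches nothing; previously that raised AttributeError on .get_text(). A small sketch of the failure mode the guard prevents (the HTML is made up):

from bs4 import BeautifulSoup

# A page with no matching element: find() yields None, so _get_document
# now returns None instead of crashing, and load() filters it out.
soup = BeautifulSoup("<html><body><div>no main tag here</div></body></html>", "html.parser")
assert soup.find("main") is None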