@@ -1,6 +1,6 @@
 """Loader that loads GitBook."""
 from typing import Any, List, Optional
-from urllib.parse import urlparse
+from urllib.parse import urljoin, urlparse
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.web_base import WebBaseLoader
@@ -18,6 +18,7 @@ class GitbookLoader(WebBaseLoader):
         web_page: str,
         load_all_paths: bool = False,
         base_url: Optional[str] = None,
+        content_selector: str = "main",
     ):
         """Initialize with web page and whether to load all paths.
 
@@ -39,6 +40,7 @@ class GitbookLoader(WebBaseLoader):
             web_paths = web_page
         super().__init__(web_paths)
         self.load_all_paths = load_all_paths
+        self.content_selector = content_selector
 
     def load(self) -> List[Document]:
         """Fetch text from one single GitBook page."""
@@ -47,18 +49,23 @@ class GitbookLoader(WebBaseLoader):
             relative_paths = self._get_paths(soup_info)
             documents = []
             for path in relative_paths:
-                url = self.base_url + path
+                url = urljoin(self.base_url, path)
                 print(f"Fetching text from {url}")
                 soup_info = self._scrape(url)
                 documents.append(self._get_document(soup_info, url))
-            return documents
+            return [d for d in documents if d]
         else:
             soup_info = self.scrape()
-            return [self._get_document(soup_info, self.web_path)]
+            documents = [self._get_document(soup_info, self.web_path)]
+            return [d for d in documents if d]
 
-    def _get_document(self, soup: Any, custom_url: Optional[str] = None) -> Document:
+    def _get_document(
+        self, soup: Any, custom_url: Optional[str] = None
+    ) -> Optional[Document]:
         """Fetch content from page and return Document."""
-        page_content_raw = soup.find("main")
+        page_content_raw = soup.find(self.content_selector)
+        if not page_content_raw:
+            return None
         content = page_content_raw.get_text(separator="\n").strip()
         title_if_exists = page_content_raw.find("h1")
         title = title_if_exists.text if title_if_exists else ""
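
For reference, a minimal usage sketch of the loader after this change. The GitBook URL is a placeholder, not taken from the diff:

    from langchain.document_loaders.gitbook import GitbookLoader

    # Crawl every path linked from the index page. With this patch, a page
    # whose content selector matches nothing yields None from _get_document
    # and is filtered out of the results instead of raising AttributeError.
    loader = GitbookLoader(
        "https://docs.gitbook.com",  # placeholder URL
        load_all_paths=True,
        content_selector="main",  # new parameter; "main" is the default
    )
    docs = loader.load()

Note the switch from `self.base_url + path` to `urljoin(self.base_url, path)`: urljoin resolves both absolute and relative paths against the base URL correctly, whereas plain string concatenation could produce doubled or missing slashes.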