add gitbook document loader (#1180)

Added a GitBook document loader. It lets you either (1) fetch text from any single GitBook page, or (2) fetch all relative paths in the navbar and return their respective content as Documents. I've modified the `scrape` method in the `WebBaseLoader` to accept a custom web path if given, but I'm happy to remove that and move the logic into the `GitbookLoader` itself.
1 year ago · 23243ae69c
parent 13ba0177d0
commit 23243ae69c
5 changed files with 245 additions and 3 deletions
--- a/docs/modules/document_loaders/examples/gitbook.ipynb
+++ b/docs/modules/document_loaders/examples/gitbook.ipynb
@ -0,0 +1,185 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "4babfba5",
+   "metadata": {},
+   "source": [
+    "# GitBook\n",
+    "How to pull page data from any GitBook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "ff49b177",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import GitbookLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "849a8d52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = GitbookLoader(\"https://docs.gitbook.com\")"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "65d5ddce",
+   "metadata": {},
+   "source": [
+    "### Load from single GitBook page"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "c2826836",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "page_data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "fefa2adc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Introduction to GitBook\\nGitBook is a modern documentation platform where teams can document everything from products to internal knowledge bases and APIs.\\nWe want to help \\nteams to work more efficiently\\n by creating a simple yet powerful platform for them to \\nshare their knowledge\\n.\\nOur mission is to make a \\nuser-friendly\\n and \\ncollaborative\\n product for everyone to create, edit and share knowledge through documentation.\\nPublish your documentation in 5 easy steps\\nImport\\n\\nMove your existing content to GitBook with ease.\\nGit Sync\\n\\nBenefit from our bi-directional synchronisation with GitHub and GitLab.\\nOrganise your content\\n\\nCreate pages and spaces and organize them into collections\\nCollaborate\\n\\nInvite other users and collaborate asynchronously with ease.\\nPublish your docs\\n\\nShare your documentation with selected users or with everyone.\\nNext\\n - Getting started\\nOverview\\nLast modified \\n3mo ago', lookup_str='', metadata={'source': 'https://docs.gitbook.com', 'title': 'Introduction to GitBook'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "page_data"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "c325048c",
+   "metadata": {},
+   "source": [
+    "### Load from all paths in a given GitBook\n",
+    "For this to work, the GitbookLoader needs to be initialized with the root path (`https://docs.gitbook.com` in this example)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "938ff4ee",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Fetching text from https://docs.gitbook.com/\n",
+      "Fetching text from https://docs.gitbook.com/getting-started/overview\n",
+      "Fetching text from https://docs.gitbook.com/getting-started/import\n",
+      "Fetching text from https://docs.gitbook.com/getting-started/git-sync\n",
+      "Fetching text from https://docs.gitbook.com/getting-started/content-structure\n",
+      "Fetching text from https://docs.gitbook.com/getting-started/collaboration\n",
+      "Fetching text from https://docs.gitbook.com/getting-started/publishing\n",
+      "Fetching text from https://docs.gitbook.com/tour/quick-find\n",
+      "Fetching text from https://docs.gitbook.com/tour/editor\n",
+      "Fetching text from https://docs.gitbook.com/tour/customization\n",
+      "Fetching text from https://docs.gitbook.com/tour/member-management\n",
+      "Fetching text from https://docs.gitbook.com/tour/pdf-export\n",
+      "Fetching text from https://docs.gitbook.com/tour/activity-history\n",
+      "Fetching text from https://docs.gitbook.com/tour/insights\n",
+      "Fetching text from https://docs.gitbook.com/tour/notifications\n",
+      "Fetching text from https://docs.gitbook.com/tour/internationalization\n",
+      "Fetching text from https://docs.gitbook.com/tour/keyboard-shortcuts\n",
+      "Fetching text from https://docs.gitbook.com/tour/seo\n",
+      "Fetching text from https://docs.gitbook.com/advanced-guides/custom-domain\n",
+      "Fetching text from https://docs.gitbook.com/advanced-guides/advanced-sharing-and-security\n",
+      "Fetching text from https://docs.gitbook.com/advanced-guides/integrations\n",
+      "Fetching text from https://docs.gitbook.com/billing-and-admin/account-settings\n",
+      "Fetching text from https://docs.gitbook.com/billing-and-admin/plans\n",
+      "Fetching text from https://docs.gitbook.com/troubleshooting/faqs\n",
+      "Fetching text from https://docs.gitbook.com/troubleshooting/hard-refresh\n",
+      "Fetching text from https://docs.gitbook.com/troubleshooting/report-bugs\n",
+      "Fetching text from https://docs.gitbook.com/troubleshooting/connectivity-issues\n",
+      "Fetching text from https://docs.gitbook.com/troubleshooting/support\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_pages_data = loader.load_from_all_paths()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "db92fc39",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "fetched 28 documents.\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "Document(page_content=\"Import\\nFind out how to easily migrate your existing documentation and which formats are supported.\\nThe import function allows you to migrate and unify existing documentation in GitBook. You can choose to import single or multiple pages although limits apply. \\nPermissions\\nAll members with editor permission or above can use the import feature.\\nSupported formats\\nGitBook supports imports from websites or files that are:\\nMarkdown (.md or .markdown)\\nHTML (.html)\\nMicrosoft Word (.docx).\\nWe also support import from:\\nConfluence\\nNotion\\nGitHub Wiki\\nQuip\\nDropbox Paper\\nGoogle Docs\\nYou can also upload a ZIP\\n \\ncontaining HTML or Markdown files when \\nimporting multiple pages.\\nNote: this feature is in beta.\\nFeel free to suggest import sources we don't support yet and \\nlet us know\\n if you have any issues.\\nImport panel\\nWhen you create a new space, you'll have the option to import content straight away:\\nThe new page menu\\nImport a page or subpage by selecting \\nImport Page\\n from the New Page menu, or \\nImport Subpage\\n in the page action menu, found in the table of contents:\\nImport from the page action menu\\nWhen you choose your input source, instructions will explain how to proceed.\\nAlthough GitBook supports importing content from different kinds of sources, the end result might be different from your source due to differences in product features and document format.\\nLimits\\nGitBook currently has the following limits for imported content:\\nThe maximum number of pages that can be uploaded in a single import is \\n20.\\nThe maximum number of files (images etc.) that can be uploaded in a single import is \\n20.\\nGetting started - \\nPrevious\\nOverview\\nNext\\n - Getting started\\nGit Sync\\nLast modified \\n4mo ago\", lookup_str='', metadata={'source': 'https://docs.gitbook.com/getting-started/import', 'title': 'Import'}, lookup_index=0)"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "print(f\"fetched {len(all_pages_data)} documents.\")\n",
+    "# show third document (index 2)\n",
+    "all_pages_data[2]"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": ".venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.9"
+  },
+  "vscode": {
+   "interpreter": {
+    "hash": "2d002ec47225e662695b764370d7966aa11eeb4302edc2f497bbf96d49c8f899"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@ -33,6 +33,8 @@ There are a lot of different document loaders that LangChain supports. Below are

 `Hacker News <./examples/hn.html>`_: A walkthrough of how to load a Hacker News page.

+`GitBook <./examples/gitbook.html>`_: A walkthrough of how to load a GitBook page.
+
 `s3 File <./examples/s3_file.html>`_: A walkthrough of how to load a file from s3.

 `s3 Directory <./examples/s3_directory.html>`_: A walkthrough of how to load all files in a directory from s3.
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -9,6 +9,7 @@ from langchain.document_loaders.email import UnstructuredEmailLoader
 from langchain.document_loaders.evernote import EverNoteLoader
 from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
 from langchain.document_loaders.gcs_file import GCSFileLoader
+from langchain.document_loaders.gitbook import GitbookLoader
 from langchain.document_loaders.googledrive import GoogleDriveLoader
 from langchain.document_loaders.gutenberg import GutenbergLoader
 from langchain.document_loaders.hn import HNLoader
@ -50,6 +51,7 @@ __all__ = [
    "S3FileLoader",
    "TextLoader",
    "HNLoader",
+    "GitbookLoader",
    "S3DirectoryLoader",
    "GCSFileLoader",
    "GCSDirectoryLoader",
--- a/langchain/document_loaders/gitbook.py
+++ b/langchain/document_loaders/gitbook.py
@ -0,0 +1,52 @@
+"""Loader that loads GitBook."""
+from typing import Any, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class GitbookLoader(WebBaseLoader):
+    """Load GitBook data.
+
+    1. load from either a single page, or
+    2. load all (relative) paths in the navbar.
+    """
+
+    def load(self, custom_web_path: Optional[str] = None) -> List[Document]:
+        """Fetch text from one single GitBook page."""
+        soup_info = self.scrape(custom_web_path)
+        url = custom_web_path if custom_web_path else self.web_path
+        return [self._get_document(soup_info, url)]
+
+    def load_from_all_paths(self) -> List[Document]:
+        """Fetch text from all pages in the navbar.
+
+        Make sure the initialized web_path is the root of the GitBook
+        """
+        soup_info = self.scrape()
+        relative_paths = self._get_paths(soup_info)
+        documents = []
+        for path in relative_paths:
+            url = self.web_path + path
+            print(f"Fetching text from {url}")
+            documents += self.load(url)
+        return documents
+
+    def _get_document(self, soup: Any, custom_url: Optional[str] = None) -> Document:
+        """Fetch content from page and return Document."""
+        page_content_raw = soup.find("main")
+        content = page_content_raw.get_text(separator="\n").strip()
+        title_if_exists = page_content_raw.find("h1")
+        title = title_if_exists.text if title_if_exists else ""
+        metadata = {
+            "source": custom_url if custom_url else self.web_path,
+            "title": title,
+        }
+        return Document(page_content=content, metadata=metadata)
+
+    def _get_paths(self, soup: Any) -> List[str]:
+        """Fetch all relative paths in the navbar."""
+        nav = soup.find("nav")
+        links = nav.findAll("a")
+        # only return relative links
+        return [link.get("href") for link in links if link.get("href")[0] == "/"]
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@ -1,5 +1,5 @@
 """Web base loader class."""
-from typing import Any, List
+from typing import Any, List, Optional

 import requests

@ -14,11 +14,12 @@ class WebBaseLoader(BaseLoader):
        """Initialize with webpage path."""
        self.web_path = web_path

-    def scrape(self) -> Any:
+    def scrape(self, custom_web_path: Optional[str] = None) -> Any:
        """Scrape data from webpage and return it in BeautifulSoup format."""
        from bs4 import BeautifulSoup

-        html_doc = requests.get(self.web_path)
+        url = custom_web_path if custom_web_path else self.web_path
+        html_doc = requests.get(url)
        soup = BeautifulSoup(html_doc.text, "html.parser")
        return soup