Harrison/blackboard loader (#1737)

Co-authored-by: Aidan Holland <thehappydinoa@gmail.com>
2023-03-17 08:02:44 -07:00 · 2023-03-17 08:02:44 -07:00 · 45f05fc939
commit 45f05fc939
parent cf9c3f54f7
6 changed files with 336 additions and 2 deletions
--- a/docs/modules/document_loaders/examples/blackboard.ipynb
+++ b/docs/modules/document_loaders/examples/blackboard.ipynb
@ -0,0 +1,38 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Blackboard\n",
+    "\n",
+    "This covers how to load data from a Blackboard Learn instance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import BlackboardLoader\n",
+    "\n",
+    "loader = BlackboardLoader(\n",
+    "    blackboard_course_url=\"https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1\",\n",
+    "    bbrouter=\"expires:12345...\",\n",
+    "    load_all_recursively=True,\n",
+    ")\n",
+    "documents = loader.load()"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@ -59,6 +59,8 @@ There are a lot of different document loaders that LangChain supports. Below are

 `iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com

+`Blackboard <./examples/blackboard.html>`_: A walkthrough of how to load data from a Blackboard course.
+
 .. toctree::
   :maxdepth: 1
   :glob:
--- a/langchain/document_loaders/init.py
+++ b/langchain/document_loaders/init.py
@ -2,9 +2,10 @@

 from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
 from langchain.document_loaders.azlyrics import AZLyricsLoader
+from langchain.document_loaders.blackboard import BlackboardLoader
 from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
 from langchain.document_loaders.conllu import CoNLLULoader
-from langchain.document_loaders.csv import CSVLoader
+from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.document_loaders.directory import DirectoryLoader
 from langchain.document_loaders.docx import UnstructuredDocxLoader
 from langchain.document_loaders.email import UnstructuredEmailLoader
@ -104,4 +105,5 @@ __all__ = [
    "GoogleApiYoutubeLoader",
    "GoogleApiClient",
    "CSVLoader",
+    "BlackboardLoader",
 ]
--- a/langchain/document_loaders/blackboard.py
+++ b/langchain/document_loaders/blackboard.py
@ -0,0 +1,292 @@
+"""Loader that loads all documents from a blackboard course."""
+import contextlib
+import re
+from pathlib import Path
+from typing import Any, List, Optional, Tuple
+from urllib.parse import unquote
+
+from langchain.docstore.document import Document
+from langchain.document_loaders import DirectoryLoader, PyPDFLoader
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class BlackboardLoader(WebBaseLoader):
+    """Loader that loads all PDF documents from a Blackboard course.
+
+    The loader scrapes a Blackboard course page, downloads every attachment
+    it finds into a local folder named after the course (relative to the
+    current working directory), and parses the downloaded ``*.pdf`` files
+    with ``PyPDFLoader``.
+
+    This loader is not compatible with all Blackboard courses. It is only
+    compatible with courses that use the new Blackboard interface.
+    To use this loader, you must have the BbRouter cookie. You can get this
+    cookie by logging into the course and then copying the value of the
+    BbRouter cookie from the browser's developer tools.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.document_loaders import BlackboardLoader
+
+            loader = BlackboardLoader(
+                blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
+                bbrouter="expires:12345...",
+            )
+            documents = loader.load()
+
+    """
+
+    # Part of the course url that precedes "/webapps/blackboard"; relative
+    # links found on the pages are appended to it.
+    base_url: str
+    # Local folder the attachments are written to; assigned by load().
+    folder_path: str
+    # Whether load() visits every page linked from the course menu.
+    load_all_recursively: bool
+
+    def __init__(
+        self,
+        blackboard_course_url: str,
+        bbrouter: str,
+        load_all_recursively: bool = True,
+        basic_auth: Optional[Tuple[str, str]] = None,
+        cookies: Optional[dict] = None,
+    ):
+        """Initialize with blackboard course url.
+
+        The BbRouter cookie is required for most blackboard courses.
+
+        Args:
+            blackboard_course_url: Blackboard course url. It must contain
+                ``/webapps/blackboard``.
+            bbrouter: BbRouter cookie.
+            load_all_recursively: If True, load all documents recursively.
+            basic_auth: Basic auth credentials.
+            cookies: Extra cookies to send. The dict is copied, not mutated;
+                a ``BbRouter`` entry in it is overridden by ``bbrouter``.
+
+        Raises:
+            ValueError: If blackboard course url is invalid.
+            ImportError: If BeautifulSoup4 is not installed.
+        """
+        super().__init__(blackboard_course_url)
+        # str.split(...)[0] never raises IndexError, so an invalid url has to
+        # be detected explicitly: partition() reports whether the marker
+        # was actually present.
+        base_url, marker, _ = blackboard_course_url.partition("/webapps/blackboard")
+        if not marker:
+            raise ValueError(
+                "Invalid blackboard course url. "
+                "Please provide a url that starts with "
+                "https://<blackboard_url>/webapps/blackboard"
+            )
+        self.base_url = base_url
+        if basic_auth is not None:
+            self.session.auth = basic_auth
+        # Combine cookies on a copy so the caller's dict is left untouched.
+        session_cookies = dict(cookies) if cookies is not None else {}
+        session_cookies["BbRouter"] = bbrouter
+        self.session.cookies.update(session_cookies)
+        self.load_all_recursively = load_all_recursively
+        self.check_bs4()
+
+    def check_bs4(self) -> None:
+        """Check if BeautifulSoup4 is installed.
+
+        Raises:
+            ImportError: If BeautifulSoup4 is not installed.
+        """
+        try:
+            import bs4  # noqa: F401
+        except ImportError as err:
+            raise ImportError(
+                "BeautifulSoup4 is required for BlackboardLoader. "
+                "Please install it with `pip install beautifulsoup4`."
+            ) from err
+
+    def load(self) -> List[Document]:
+        """Load data into document objects.
+
+        Attachments are downloaded to ``self.folder_path`` as a side effect
+        and progress is printed to stdout.
+
+        Returns:
+            List of documents.
+
+        Raises:
+            ValueError: If the course name or (in recursive mode) the course
+                menu cannot be found on the start page, or, in non-recursive
+                mode, if the page has no content list or an attachment is
+                not a PDF.
+        """
+        if not self.load_all_recursively:
+            print(f"Fetching documents from {self.web_path}")
+            soup_info = self.scrape()
+            self.folder_path = self._get_folder_path(soup_info)
+            return self._get_documents(soup_info)
+
+        soup_info = self.scrape()
+        self.folder_path = self._get_folder_path(soup_info)
+        any_page_downloaded = False
+        for path in self._get_paths(soup_info):
+            url = self.base_url + path
+            print(f"Fetching documents from {url}")
+            page_soup = self._scrape(url)
+            # Best effort: menu entries without a content list, or with a
+            # non-PDF attachment, raise ValueError and are skipped.
+            with contextlib.suppress(ValueError):
+                self._download_attachments(self._get_attachments(page_soup))
+                any_page_downloaded = True
+        if not any_page_downloaded:
+            return []
+        # Parse the folder once, after every page has been downloaded.
+        # Parsing it after each page would return the PDFs of earlier pages
+        # again and again, because all pages share the same folder.
+        return self._load_documents()
+
+    def _get_folder_path(self, soup: Any) -> str:
+        """Get the folder path to save the documents in.
+
+        The folder is named after the course name shown in the element with
+        id ``crumb_1``, with characters that are awkward in file names
+        replaced by underscores.
+
+        Args:
+            soup: BeautifulSoup4 soup object.
+
+        Returns:
+            Folder path.
+
+        Raises:
+            ValueError: If the course name element is missing.
+        """
+        # Get the course name
+        crumb = soup.find("span", {"id": "crumb_1"})
+        if crumb is None:
+            raise ValueError("No course name found.")
+        course_name = unquote(crumb.text.strip())
+        # A single translate() pass replaces every unwanted character.
+        unwanted = str.maketrans(dict.fromkeys(" /:,?'!\"", "_"))
+        course_name_clean = course_name.translate(unwanted)
+        # Get the folder path
+        return str(Path(".") / course_name_clean)
+
+    def _get_documents(self, soup: Any) -> List[Document]:
+        """Fetch content from page and return Documents.
+
+        Downloads the page's attachments and then parses every PDF that is
+        present in ``self.folder_path``.
+
+        Args:
+            soup: BeautifulSoup4 soup object.
+
+        Returns:
+            List of documents.
+        """
+        attachments = self._get_attachments(soup)
+        self._download_attachments(attachments)
+        return self._load_documents()
+
+    def _get_attachments(self, soup: Any) -> List[str]:
+        """Get all attachments from a page.
+
+        Args:
+            soup: BeautifulSoup4 soup object.
+
+        Returns:
+            List of attachment hrefs, excluding in-page ``#`` anchors.
+
+        Raises:
+            ValueError: If the page has no ``contentList`` element.
+        """
+        # Get content list
+        content_list = soup.find("ul", {"class": "contentList"})
+        if content_list is None:
+            raise ValueError("No content list found.")
+        # Get all attachments
+        attachments = []
+        for attachment in content_list.find_all("ul", {"class": "attachments"}):
+            for link in attachment.find_all("a"):
+                href = link.get("href")
+                # Only add if href is not None and does not start with #
+                if href is not None and not href.startswith("#"):
+                    attachments.append(href)
+        return attachments
+
+    def _download_attachments(self, attachments: List[str]) -> None:
+        """Download all attachments.
+
+        Args:
+            attachments: List of attachments.
+        """
+        # Make sure the folder exists
+        Path(self.folder_path).mkdir(parents=True, exist_ok=True)
+        # Download all attachments
+        for attachment in attachments:
+            self.download(attachment)
+
+    def _load_documents(self) -> List[Document]:
+        """Load all documents in the folder.
+
+        Returns:
+            List of documents, one list entry per item that ``PyPDFLoader``
+            yields for each ``*.pdf`` file directly inside the folder.
+        """
+        # Create the document loader
+        loader = DirectoryLoader(
+            path=self.folder_path, glob="*.pdf", loader_cls=PyPDFLoader  # type: ignore
+        )
+        # Load and return the documents
+        return loader.load()
+
+    def _get_paths(self, soup: Any) -> List[str]:
+        """Get all relative paths in the navbar.
+
+        Args:
+            soup: BeautifulSoup4 soup object.
+
+        Returns:
+            Hrefs of the course menu links that start with ``/``.
+
+        Raises:
+            ValueError: If the page has no ``courseMenu`` element.
+        """
+        course_menu = soup.find("ul", {"class": "courseMenu"})
+        if course_menu is None:
+            raise ValueError("No course menu found.")
+        hrefs = (link.get("href") for link in course_menu.find_all("a"))
+        return [href for href in hrefs if href is not None and href.startswith("/")]
+
+    def download(self, path: str) -> None:
+        """Download a file from a url.
+
+        Args:
+            path: Path to the file, relative to ``self.base_url``.
+
+        Raises:
+            ValueError: If no PDF filename can be derived from the final url.
+        """
+        # Get the file content
+        response = self.session.get(self.base_url + path, allow_redirects=True)
+        # The filename is derived from a server-controlled, url-decoded
+        # string; keep only its last component so that it cannot point
+        # outside of folder_path.
+        filename = Path(self.parse_filename(response.url)).name
+        # Write the file to disk
+        with open(Path(self.folder_path) / filename, "wb") as f:
+            f.write(response.content)
+
+    def parse_filename(self, url: str) -> str:
+        """Parse the filename from a url.
+
+        Args:
+            url: Url to parse the filename from.
+
+        Returns:
+            The filename.
+
+        Raises:
+            ValueError: If the url does not end in ``.pdf`` and no filename
+                could be parsed from it.
+        """
+        url_path = Path(url)
+        if url_path.suffix == ".pdf":
+            return url_path.name
+        return self._parse_filename_from_url(url)
+
+    def _parse_filename_from_url(self, url: str) -> str:
+        """Parse the filename from a url.
+
+        Args:
+            url: Url to parse the filename from.
+
+        Returns:
+            The filename.
+
+        Raises:
+            ValueError: If the filename could not be parsed.
+        """
+        # The pattern is the url-encoded form of ``filename*=UTF-8''``.
+        filename_matches = re.search(r"filename%2A%3DUTF-8%27%27(.+)", url)
+        if filename_matches is None:
+            raise ValueError(f"Could not parse filename from {url}")
+        filename = filename_matches.group(1)
+        if ".pdf" not in filename:
+            raise ValueError(f"Incorrect file type: {filename}")
+        # Drop whatever follows the first ".pdf" (e.g. further url parameters).
+        filename = filename.split(".pdf")[0] + ".pdf"
+        filename = unquote(filename)
+        filename = filename.replace("%20", " ")
+        return filename
+
+
+if __name__ == "__main__":
+    # Manual smoke test: replace the placeholders with a real course url and
+    # BbRouter cookie before running this module as a script.
+    course_url = (
+        "https://<YOUR BLACKBOARD URL"
+        " HERE>/webapps/blackboard/content/listContent.jsp?course_id=_<YOUR COURSE ID"
+        " HERE>_1&content_id=_<YOUR CONTENT ID HERE>_1&mode=reset"
+    )
+    loader = BlackboardLoader(
+        blackboard_course_url=course_url,
+        bbrouter="<YOUR BBROUTER COOKIE HERE>",
+        load_all_recursively=True,
+    )
+    documents = loader.load()
+    print(f"Loaded {len(documents)} pages of PDFs from {loader.web_path}")
--- a/langchain/document_loaders/csv_loader.py
+++ b/langchain/document_loaders/csv_loader.py
--- a/langchain/document_loaders/youtube.py
+++ b/langchain/document_loaders/youtube.py
@ -177,7 +177,7 @@ class GoogleApiYoutubeLoader(BaseLoader):
    As the service needs a google_api_client, you first have to initialize
    the GoogleApiClient.

-    Additonali you have to either provide a channel name or a list of videoids
+    Additionally you have to either provide a channel name or a list of videoids
    "https://developers.google.com/docs/api/quickstart/python"