From 45f05fc9392a74090dce8e60677a805c3edc2eb8 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Fri, 17 Mar 2023 08:02:44 -0700 Subject: [PATCH] Harrison/blackboard loader (#1737) Co-authored-by: Aidan Holland --- .../examples/blackboard.ipynb | 38 +++ .../document_loaders/how_to_guides.rst | 2 + langchain/document_loaders/__init__.py | 4 +- langchain/document_loaders/blackboard.py | 292 ++++++++++++++++++ .../{csv.py => csv_loader.py} | 0 langchain/document_loaders/youtube.py | 2 +- 6 files changed, 336 insertions(+), 2 deletions(-) create mode 100644 docs/modules/document_loaders/examples/blackboard.ipynb create mode 100644 langchain/document_loaders/blackboard.py rename langchain/document_loaders/{csv.py => csv_loader.py} (100%) diff --git a/docs/modules/document_loaders/examples/blackboard.ipynb b/docs/modules/document_loaders/examples/blackboard.ipynb new file mode 100644 index 00000000..8b0c815b --- /dev/null +++ b/docs/modules/document_loaders/examples/blackboard.ipynb @@ -0,0 +1,38 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Blackboard\n", + "\n", + "This covers how to load data from a Blackboard Learn instance." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import BlackboardLoader\n", + "\n", + "loader = BlackboardLoader(\n", + " blackboard_course_url=\"https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1\",\n", + " bbrouter=\"expires:12345...\",\n", + " load_all_recursively=True,\n", + ")\n", + "documents = loader.load()" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/modules/document_loaders/how_to_guides.rst b/docs/modules/document_loaders/how_to_guides.rst index 741c4fcb..20c3688b 100644 --- a/docs/modules/document_loaders/how_to_guides.rst +++ b/docs/modules/document_loaders/how_to_guides.rst @@ -59,6 +59,8 @@ There are a lot of different document loaders that LangChain supports. Below are `iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com +`Blackboard <./examples/blackboard.html>`_: A walkthrough of how to load data from a Blackboard course. + .. toctree:: :maxdepth: 1 :glob: diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 78c07776..ec2058dc 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -2,9 +2,10 @@ from langchain.document_loaders.airbyte_json import AirbyteJSONLoader from langchain.document_loaders.azlyrics import AZLyricsLoader +from langchain.document_loaders.blackboard import BlackboardLoader from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.conllu import CoNLLULoader -from langchain.document_loaders.csv import CSVLoader +from langchain.document_loaders.csv_loader import CSVLoader from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.docx import UnstructuredDocxLoader from langchain.document_loaders.email import UnstructuredEmailLoader @@ -104,4 +105,5 @@ __all__ = [ "GoogleApiYoutubeLoader", "GoogleApiClient", "CSVLoader", + "BlackboardLoader", ] diff --git a/langchain/document_loaders/blackboard.py b/langchain/document_loaders/blackboard.py new file mode 100644 index 00000000..f323105e --- /dev/null +++ b/langchain/document_loaders/blackboard.py @@ -0,0 +1,292 @@ +"""Loader that loads all documents from a blackboard course.""" +import contextlib +import re +from pathlib import Path +from typing import Any, List, Optional, Tuple +from urllib.parse import unquote + +from langchain.docstore.document import Document +from langchain.document_loaders import DirectoryLoader, PyPDFLoader +from langchain.document_loaders.web_base import WebBaseLoader + + +class BlackboardLoader(WebBaseLoader): + """Loader that loads all documents from a Blackboard course. + + This loader is not compatible with all Blackboard courses. It is only + compatible with courses that use the new Blackboard interface. + To use this loader, you must have the BbRouter cookie. You can get this + cookie by logging into the course and then copying the value of the + BbRouter cookie from the browser's developer tools. + + Example: + .. code-block:: python + + from langchain.document_loaders import BlackboardLoader + + loader = BlackboardLoader( + blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1", + bbrouter="expires:12345...", + ) + documents = loader.load() + + """ + + base_url: str + folder_path: str + load_all_recursively: bool + + def __init__( + self, + blackboard_course_url: str, + bbrouter: str, + load_all_recursively: bool = True, + basic_auth: Optional[Tuple[str, str]] = None, + cookies: Optional[dict] = None, + ): + """Initialize with blackboard course url. + + The BbRouter cookie is required for most blackboard courses. + + Args: + blackboard_course_url: Blackboard course url. + bbrouter: BbRouter cookie. + load_all_recursively: If True, load all documents recursively. + basic_auth: Basic auth credentials. + cookies: Cookies. + + Raises: + ValueError: If blackboard course url is invalid. + """ + super().__init__(blackboard_course_url) + # Get base url + try: + self.base_url = blackboard_course_url.split("/webapps/blackboard")[0] + except IndexError: + raise ValueError( + "Invalid blackboard course url. " + "Please provide a url that starts with " + "https:///webapps/blackboard" + ) + if basic_auth is not None: + self.session.auth = basic_auth + # Combine cookies + if cookies is None: + cookies = {} + cookies.update({"BbRouter": bbrouter}) + self.session.cookies.update(cookies) + self.load_all_recursively = load_all_recursively + self.check_bs4() + + def check_bs4(self) -> None: + """Check if BeautifulSoup4 is installed. + + Raises: + ImportError: If BeautifulSoup4 is not installed. + """ + try: + import bs4 # noqa: F401 + except ImportError: + raise ImportError( + "BeautifulSoup4 is required for BlackboardLoader. " + "Please install it with `pip install beautifulsoup4`." + ) + + def load(self) -> List[Document]: + """Load data into document objects. + + Returns: + List of documents. + """ + if self.load_all_recursively: + soup_info = self.scrape() + self.folder_path = self._get_folder_path(soup_info) + relative_paths = self._get_paths(soup_info) + documents = [] + for path in relative_paths: + url = self.base_url + path + print(f"Fetching documents from {url}") + soup_info = self._scrape(url) + with contextlib.suppress(ValueError): + documents.extend(self._get_documents(soup_info)) + return documents + else: + print(f"Fetching documents from {self.web_path}") + soup_info = self.scrape() + self.folder_path = self._get_folder_path(soup_info) + return self._get_documents(soup_info) + + def _get_folder_path(self, soup: Any) -> str: + """Get the folder path to save the documents in. + + Args: + soup: BeautifulSoup4 soup object. + + Returns: + Folder path. + """ + # Get the course name + course_name = soup.find("span", {"id": "crumb_1"}) + if course_name is None: + raise ValueError("No course name found.") + course_name = course_name.text.strip() + # Prepare the folder path + course_name_clean = ( + unquote(course_name) + .replace(" ", "_") + .replace("/", "_") + .replace(":", "_") + .replace(",", "_") + .replace("?", "_") + .replace("'", "_") + .replace("!", "_") + .replace('"', "_") + ) + # Get the folder path + folder_path = Path(".") / course_name_clean + return str(folder_path) + + def _get_documents(self, soup: Any) -> List[Document]: + """Fetch content from page and return Documents. + + Args: + soup: BeautifulSoup4 soup object. + + Returns: + List of documents. + """ + attachments = self._get_attachments(soup) + self._download_attachments(attachments) + documents = self._load_documents() + return documents + + def _get_attachments(self, soup: Any) -> List[str]: + """Get all attachments from a page. + + Args: + soup: BeautifulSoup4 soup object. + + Returns: + List of attachments. + """ + from bs4 import BeautifulSoup, Tag + + # Get content list + content_list = soup.find("ul", {"class": "contentList"}) + if content_list is None: + raise ValueError("No content list found.") + content_list: BeautifulSoup # type: ignore + # Get all attachments + attachments = [] + for attachment in content_list.find_all("ul", {"class": "attachments"}): + attachment: Tag # type: ignore + for link in attachment.find_all("a"): + link: Tag # type: ignore + href = link.get("href") + # Only add if href is not None and does not start with # + if href is not None and not href.startswith("#"): + attachments.append(href) + return attachments + + def _download_attachments(self, attachments: List[str]) -> None: + """Download all attachments. + + Args: + attachments: List of attachments. + """ + # Make sure the folder exists + Path(self.folder_path).mkdir(parents=True, exist_ok=True) + # Download all attachments + for attachment in attachments: + self.download(attachment) + + def _load_documents(self) -> List[Document]: + """Load all documents in the folder. + + Returns: + List of documents. + """ + # Create the document loader + loader = DirectoryLoader( + path=self.folder_path, glob="*.pdf", loader_cls=PyPDFLoader # type: ignore + ) + # Load the documents + documents = loader.load() + # Return all documents + return documents + + def _get_paths(self, soup: Any) -> List[str]: + """Get all relative paths in the navbar.""" + relative_paths = [] + course_menu = soup.find("ul", {"class": "courseMenu"}) + if course_menu is None: + raise ValueError("No course menu found.") + for link in course_menu.find_all("a"): + href = link.get("href") + if href is not None and href.startswith("/"): + relative_paths.append(href) + return relative_paths + + def download(self, path: str) -> None: + """Download a file from a url. + + Args: + path: Path to the file. + """ + # Get the file content + response = self.session.get(self.base_url + path, allow_redirects=True) + # Get the filename + filename = self.parse_filename(response.url) + # Write the file to disk + with open(Path(self.folder_path) / filename, "wb") as f: + f.write(response.content) + + def parse_filename(self, url: str) -> str: + """Parse the filename from a url. + + Args: + url: Url to parse the filename from. + + Returns: + The filename. + """ + if (url_path := Path(url)) and url_path.suffix == ".pdf": + return url_path.name + else: + return self._parse_filename_from_url(url) + + def _parse_filename_from_url(self, url: str) -> str: + """Parse the filename from a url. + + Args: + url: Url to parse the filename from. + + Returns: + The filename. + + Raises: + ValueError: If the filename could not be parsed. + """ + filename_matches = re.search(r"filename%2A%3DUTF-8%27%27(.+)", url) + if filename_matches: + filename = filename_matches.group(1) + else: + raise ValueError(f"Could not parse filename from {url}") + if ".pdf" not in filename: + raise ValueError(f"Incorrect file type: {filename}") + filename = filename.split(".pdf")[0] + ".pdf" + filename = unquote(filename) + filename = filename.replace("%20", " ") + return filename + + +if __name__ == "__main__": + loader = BlackboardLoader( + "https:///webapps/blackboard/content/listContent.jsp?course_id=__1&content_id=__1&mode=reset", + "", + load_all_recursively=True, + ) + documents = loader.load() + print(f"Loaded {len(documents)} pages of PDFs from {loader.web_path}") diff --git a/langchain/document_loaders/csv.py b/langchain/document_loaders/csv_loader.py similarity index 100% rename from langchain/document_loaders/csv.py rename to langchain/document_loaders/csv_loader.py diff --git a/langchain/document_loaders/youtube.py b/langchain/document_loaders/youtube.py index 69b1e7a8..7ba5cb50 100644 --- a/langchain/document_loaders/youtube.py +++ b/langchain/document_loaders/youtube.py @@ -177,7 +177,7 @@ class GoogleApiYoutubeLoader(BaseLoader): As the service needs a google_api_client, you first have to initialize the GoogleApiClient. - Additonali you have to either provide a channel name or a list of videoids + Additionally you have to either provide a channel name or a list of videoids "https://developers.google.com/docs/api/quickstart/python"