forked from Archives/langchain

Harrison/blackboard loader (#1737)

Co-authored-by: Aidan Holland <thehappydinoa@gmail.com>
parent cf9c3f54f7
commit 45f05fc939

docs/modules/document_loaders/examples/blackboard.ipynb (new file, 38 lines)
@@ -0,0 +1,38 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Blackboard\n",
    "\n",
    "This covers how to load data from a Blackboard Learn instance."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders import BlackboardLoader\n",
    "\n",
    "loader = BlackboardLoader(\n",
    "    blackboard_course_url=\"https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1\",\n",
    "    bbrouter=\"expires:12345...\",\n",
    "    load_all_recursively=True,\n",
    ")\n",
    "documents = loader.load()"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
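Each entry in the returned `documents` list is a langchain `Document` with `page_content` and `metadata` fields; since the loader reads the downloaded files with `PyPDFLoader`, there is one document per PDF page. A minimal sketch of inspecting the result, continuing from the notebook cell above:

    for doc in documents:
        # "source" is the path of the downloaded PDF; "page" is the page index
        print(doc.metadata["source"], doc.metadata.get("page"), len(doc.page_content))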
@@ -59,6 +59,8 @@ There are a lot of different document loaders that LangChain supports. Below are

 `iFixit <./examples/ifixit.html>`_: A walkthrough of how to search and load data like guides, technical Q&A's, and device wikis from iFixit.com

+`Blackboard <./examples/blackboard.html>`_: A walkthrough of how to load data from a Blackboard course.
+
 .. toctree::
    :maxdepth: 1
    :glob:
langchain/document_loaders/__init__.py
@@ -2,9 +2,10 @@
 from langchain.document_loaders.airbyte_json import AirbyteJSONLoader
 from langchain.document_loaders.azlyrics import AZLyricsLoader
+from langchain.document_loaders.blackboard import BlackboardLoader
 from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
 from langchain.document_loaders.conllu import CoNLLULoader
 from langchain.document_loaders.csv import CSVLoader
 from langchain.document_loaders.csv_loader import CSVLoader
 from langchain.document_loaders.directory import DirectoryLoader
 from langchain.document_loaders.docx import UnstructuredDocxLoader
 from langchain.document_loaders.email import UnstructuredEmailLoader
@@ -104,4 +105,5 @@ __all__ = [
     "GoogleApiYoutubeLoader",
     "GoogleApiClient",
     "CSVLoader",
+    "BlackboardLoader",
 ]
langchain/document_loaders/blackboard.py (new file, 292 lines)
@@ -0,0 +1,292 @@
"""Loader that loads all documents from a blackboard course."""
|
||||
import contextlib
|
||||
import re
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Optional, Tuple
|
||||
from urllib.parse import unquote
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class BlackboardLoader(WebBaseLoader):
|
||||
"""Loader that loads all documents from a Blackboard course.
|
||||
|
||||
This loader is not compatible with all Blackboard courses. It is only
|
||||
compatible with courses that use the new Blackboard interface.
|
||||
To use this loader, you must have the BbRouter cookie. You can get this
|
||||
cookie by logging into the course and then copying the value of the
|
||||
BbRouter cookie from the browser's developer tools.
|
||||
|
||||
Example:
|
||||
.. code-block:: python
|
||||
|
||||
from langchain.document_loaders import BlackboardLoader
|
||||
|
||||
loader = BlackboardLoader(
|
||||
blackboard_course_url="https://blackboard.example.com/webapps/blackboard/execute/announcement?method=search&context=course_entry&course_id=_123456_1",
|
||||
bbrouter="expires:12345...",
|
||||
)
|
||||
documents = loader.load()
|
||||
|
||||
"""
|
||||
|
||||
base_url: str
|
||||
folder_path: str
|
||||
load_all_recursively: bool
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
blackboard_course_url: str,
|
||||
bbrouter: str,
|
||||
load_all_recursively: bool = True,
|
||||
basic_auth: Optional[Tuple[str, str]] = None,
|
||||
cookies: Optional[dict] = None,
|
||||
):
|
||||
"""Initialize with blackboard course url.
|
||||
|
||||
The BbRouter cookie is required for most blackboard courses.
|
||||
|
||||
Args:
|
||||
blackboard_course_url: Blackboard course url.
|
||||
bbrouter: BbRouter cookie.
|
||||
load_all_recursively: If True, load all documents recursively.
|
||||
basic_auth: Basic auth credentials.
|
||||
cookies: Cookies.
|
||||
|
||||
Raises:
|
||||
ValueError: If blackboard course url is invalid.
|
||||
"""
|
||||
        super().__init__(blackboard_course_url)
        # Get base url. Validate the url explicitly: str.split returns the
        # whole string when the separator is absent, so a malformed url
        # would otherwise pass through silently.
        if "/webapps/blackboard" not in blackboard_course_url:
            raise ValueError(
                "Invalid blackboard course url. "
                "Please provide a url that starts with "
                "https://<blackboard_url>/webapps/blackboard"
            )
        self.base_url = blackboard_course_url.split("/webapps/blackboard")[0]
        if basic_auth is not None:
            self.session.auth = basic_auth
        # Combine caller-supplied cookies with the required BbRouter cookie
        if cookies is None:
            cookies = {}
        cookies.update({"BbRouter": bbrouter})
        self.session.cookies.update(cookies)
        self.load_all_recursively = load_all_recursively
        self.check_bs4()
    def check_bs4(self) -> None:
        """Check if BeautifulSoup4 is installed.

        Raises:
            ImportError: If BeautifulSoup4 is not installed.
        """
        try:
            import bs4  # noqa: F401
        except ImportError:
            raise ImportError(
                "BeautifulSoup4 is required for BlackboardLoader. "
                "Please install it with `pip install beautifulsoup4`."
            )
    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            List of documents.
        """
        if self.load_all_recursively:
            soup_info = self.scrape()
            self.folder_path = self._get_folder_path(soup_info)
            relative_paths = self._get_paths(soup_info)
            documents = []
            for path in relative_paths:
                url = self.base_url + path
                print(f"Fetching documents from {url}")
                soup_info = self._scrape(url)
                with contextlib.suppress(ValueError):
                    documents.extend(self._get_documents(soup_info))
            return documents
        else:
            print(f"Fetching documents from {self.web_path}")
            soup_info = self.scrape()
            self.folder_path = self._get_folder_path(soup_info)
            return self._get_documents(soup_info)
    def _get_folder_path(self, soup: Any) -> str:
        """Get the folder path to save the documents in.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            Folder path.
        """
        # Get the course name
        course_name = soup.find("span", {"id": "crumb_1"})
        if course_name is None:
            raise ValueError("No course name found.")
        course_name = course_name.text.strip()
        # Prepare the folder path
        course_name_clean = (
            unquote(course_name)
            .replace(" ", "_")
            .replace("/", "_")
            .replace(":", "_")
            .replace(",", "_")
            .replace("?", "_")
            .replace("'", "_")
            .replace("!", "_")
            .replace('"', "_")
        )
        # Get the folder path
        folder_path = Path(".") / course_name_clean
        return str(folder_path)
    def _get_documents(self, soup: Any) -> List[Document]:
        """Fetch content from page and return Documents.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of documents.
        """
        attachments = self._get_attachments(soup)
        self._download_attachments(attachments)
        documents = self._load_documents()
        return documents
    def _get_attachments(self, soup: Any) -> List[str]:
        """Get all attachments from a page.

        Args:
            soup: BeautifulSoup4 soup object.

        Returns:
            List of attachments.
        """
        from bs4 import BeautifulSoup, Tag

        # Get content list
        content_list = soup.find("ul", {"class": "contentList"})
        if content_list is None:
            raise ValueError("No content list found.")
        content_list: BeautifulSoup  # type: ignore
        # Get all attachments
        attachments = []
        for attachment in content_list.find_all("ul", {"class": "attachments"}):
            attachment: Tag  # type: ignore
            for link in attachment.find_all("a"):
                link: Tag  # type: ignore
                href = link.get("href")
                # Only add if href is not None and does not start with #
                if href is not None and not href.startswith("#"):
                    attachments.append(href)
        return attachments
    def _download_attachments(self, attachments: List[str]) -> None:
        """Download all attachments.

        Args:
            attachments: List of attachments.
        """
        # Make sure the folder exists
        Path(self.folder_path).mkdir(parents=True, exist_ok=True)
        # Download all attachments
        for attachment in attachments:
            self.download(attachment)
    def _load_documents(self) -> List[Document]:
        """Load all documents in the folder.

        Returns:
            List of documents.
        """
        # Create the document loader
        loader = DirectoryLoader(
            path=self.folder_path, glob="*.pdf", loader_cls=PyPDFLoader  # type: ignore
        )
        # Load the documents
        documents = loader.load()
        # Return all documents
        return documents
    def _get_paths(self, soup: Any) -> List[str]:
        """Get all relative paths in the navbar."""
        relative_paths = []
        course_menu = soup.find("ul", {"class": "courseMenu"})
        if course_menu is None:
            raise ValueError("No course menu found.")
        for link in course_menu.find_all("a"):
            href = link.get("href")
            if href is not None and href.startswith("/"):
                relative_paths.append(href)
        return relative_paths
    def download(self, path: str) -> None:
        """Download a file from a url.

        Args:
            path: Path to the file.
        """
        # Get the file content
        response = self.session.get(self.base_url + path, allow_redirects=True)
        # Get the filename
        filename = self.parse_filename(response.url)
        # Write the file to disk
        with open(Path(self.folder_path) / filename, "wb") as f:
            f.write(response.content)
    def parse_filename(self, url: str) -> str:
        """Parse the filename from a url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The filename.
        """
        if (url_path := Path(url)) and url_path.suffix == ".pdf":
            return url_path.name
        else:
            return self._parse_filename_from_url(url)
    def _parse_filename_from_url(self, url: str) -> str:
        """Parse the filename from a url.

        Args:
            url: Url to parse the filename from.

        Returns:
            The filename.

        Raises:
            ValueError: If the filename could not be parsed.
        """
        filename_matches = re.search(r"filename%2A%3DUTF-8%27%27(.+)", url)
        if filename_matches:
            filename = filename_matches.group(1)
        else:
            raise ValueError(f"Could not parse filename from {url}")
        if ".pdf" not in filename:
            raise ValueError(f"Incorrect file type: {filename}")
        filename = filename.split(".pdf")[0] + ".pdf"
        filename = unquote(filename)
        filename = filename.replace("%20", " ")
        return filename


if __name__ == "__main__":
    loader = BlackboardLoader(
        "https://<YOUR BLACKBOARD URL"
        " HERE>/webapps/blackboard/content/listContent.jsp?course_id=_<YOUR COURSE ID"
        " HERE>_1&content_id=_<YOUR CONTENT ID HERE>_1&mode=reset",
        "<YOUR BBROUTER COOKIE HERE>",
        load_all_recursively=True,
    )
    documents = loader.load()
    print(f"Loaded {len(documents)} pages of PDFs from {loader.web_path}")
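The regex in `_parse_filename_from_url` above matches the percent-encoded RFC 5987 form `filename*=UTF-8''<name>` that appears in the redirect url of an attachment. A minimal sketch of that parsing step, with a made-up url (the host, path, and query parameter here are illustrative, not from a real Blackboard instance):

    import re
    from urllib.parse import unquote

    url = (
        "https://blackboard.example.com/bbcswebdav/dl/123"
        "?response-content-disposition=attachment%3B%20"
        "filename%2A%3DUTF-8%27%27Lecture%201.pdf"
    )
    match = re.search(r"filename%2A%3DUTF-8%27%27(.+)", url)
    assert match is not None
    # Trim anything after ".pdf", then decode the percent-escapes
    filename = unquote(match.group(1).split(".pdf")[0] + ".pdf")
    print(filename)  # Lecture 1.pdf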
langchain/document_loaders/youtube.py
@@ -177,7 +177,7 @@ class GoogleApiYoutubeLoader(BaseLoader):

     As the service needs a google_api_client, you first have to initialize
     the GoogleApiClient.

-    Additonali you have to either provide a channel name or a list of videoids
+    Additionally you have to either provide a channel name or a list of videoids
     "https://developers.google.com/docs/api/quickstart/python"