adding webpage loading logic (#942)

1 year ago · 8e126bc9bd
parent c71027e725
commit 8e126bc9bd
10 changed files with 504 additions and 0 deletions
--- a/docs/modules/document_loaders/examples/azlyrics.ipynb
+++ b/docs/modules/document_loaders/examples/azlyrics.ipynb
@ -0,0 +1,93 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "9c31caff",
+   "metadata": {},
+   "source": [
+    "# AZLyrics\n",
+    "This covers how to load AZLyrics webpages into a document format that we can use downstream."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "7e6f5726",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import AZLyricsLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "a0df4c24",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = AZLyricsLoader(\"https://www.azlyrics.com/lyrics/mileycyrus/flowers.html\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "8cd61b6e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "data = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "162fd286",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content=\"Miley Cyrus - Flowers Lyrics | AZLyrics.com\\n\\r\\nWe were good, we were gold\\nKinda dream that can't be sold\\nWe were right till we weren't\\nBuilt a home and watched it burn\\n\\nI didn't wanna leave you\\nI didn't wanna lie\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\n\\nPaint my nails, cherry red\\nMatch the roses that you left\\nNo remorse, no regret\\nI forgive every word you said\\n\\nI didn't wanna leave you, baby\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours, yeah\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\\nI didn't wanna wanna leave you\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours (Yeah)\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than\\nYeah, I can love me better than you can, uh\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby (Than you can)\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\", lookup_str='', metadata={'source': 'https://www.azlyrics.com/lyrics/mileycyrus/flowers.html'}, lookup_index=0)]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6358000c",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/docs/modules/document_loaders/examples/college_confidential.ipynb
+++ b/docs/modules/document_loaders/examples/college_confidential.ipynb
--- a/docs/modules/document_loaders/examples/imsdb.ipynb
+++ b/docs/modules/document_loaders/examples/imsdb.ipynb
--- a/docs/modules/document_loaders/examples/web_base.ipynb
+++ b/docs/modules/document_loaders/examples/web_base.ipynb
--- a/docs/modules/document_loaders/how_to_guides.rst
+++ b/docs/modules/document_loaders/how_to_guides.rst
@ -37,6 +37,14 @@ There are a lot of different document loaders that LangChain supports. Below are

 `GCS Directory <./examples/gcs_directory.html>`_: A walkthrough of how to load all files in a directory from Google Cloud Storage (GCS).

+`Web Base <./examples/web_base.html>`_: A walkthrough of how to load all text data from webpages.
+
+`IMSDb <./examples/imsdb.html>`_: A walkthrough of how to load all text data from an IMSDb webpage.
+
+`AZLyrics <./examples/azlyrics.html>`_: A walkthrough of how to load all text data from an AZLyrics webpage.
+
+`College Confidential <./examples/college_confidential.html>`_: A walkthrough of how to load all text data from a College Confidential webpage.
+
 `Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.

 .. toctree::
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -1,5 +1,7 @@
 """All different types of document loaders."""

+from langchain.document_loaders.azlyrics import AZLyricsLoader
+from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
 from langchain.document_loaders.directory import DirectoryLoader
 from langchain.document_loaders.docx import UnstructuredDocxLoader
 from langchain.document_loaders.email import UnstructuredEmailLoader
@ -8,6 +10,7 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
 from langchain.document_loaders.googledrive import GoogleDriveLoader
 from langchain.document_loaders.gutenberg import GutenbergLoader
 from langchain.document_loaders.html import UnstructuredHTMLLoader
+from langchain.document_loaders.imsdb import IMSDbLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.obsidian import ObsidianLoader
 from langchain.document_loaders.pdf import UnstructuredPDFLoader
@ -17,6 +20,7 @@ from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
+from langchain.document_loaders.web_base import WebBaseLoader
 from langchain.document_loaders.youtube import YoutubeLoader

 __all__ = [
@ -37,5 +41,9 @@ __all__ = [
    "S3DirectoryLoader",
    "GCSFileLoader",
    "GCSDirectoryLoader",
+    "WebBaseLoader",
+    "IMSDbLoader",
+    "AZLyricsLoader",
+    "CollegeConfidentialLoader",
    "GutenbergLoader",
 ]
--- a/langchain/document_loaders/azlyrics.py
+++ b/langchain/document_loaders/azlyrics.py
@ -0,0 +1,22 @@
+"""Loader that loads AZLyrics."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class AZLyricsLoader(WebBaseLoader):
+    """Loader that loads AZLyrics webpages."""
+
+    def __init__(self, web_path: str):
+        """Initialize with webpage path."""
+        self.web_path = web_path
+
+    def load(self) -> List[Document]:
+        """Load webpage."""
+        soup = self.scrape()
+        title = soup.title.text
+        lyrics = soup.find_all("div", {"class": ""})[2].text
+        text = title + lyrics
+        metadata = {"source": self.web_path}
+        return [Document(page_content=text, metadata=metadata)]
--- a/langchain/document_loaders/college_confidential.py
+++ b/langchain/document_loaders/college_confidential.py
@ -0,0 +1,20 @@
+"""Loader that loads College Confidential."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class CollegeConfidentialLoader(WebBaseLoader):
+    """Loader that loads College Confidential webpages."""
+
+    def __init__(self, web_path: str):
+        """Initialize with webpage path."""
+        self.web_path = web_path
+
+    def load(self) -> List[Document]:
+        """Load webpage."""
+        soup = self.scrape()
+        text = soup.select_one("main[class='skin-handler']").text
+        metadata = {"source": self.web_path}
+        return [Document(page_content=text, metadata=metadata)]
--- a/langchain/document_loaders/imsdb.py
+++ b/langchain/document_loaders/imsdb.py
@ -0,0 +1,20 @@
+"""Loader that loads IMSDb."""
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.web_base import WebBaseLoader
+
+
+class IMSDbLoader(WebBaseLoader):
+    """Loader that loads IMSDb webpages."""
+
+    def __init__(self, web_path: str):
+        """Initialize with webpage path."""
+        self.web_path = web_path
+
+    def load(self) -> List[Document]:
+        """Load webpage."""
+        soup = self.scrape()
+        text = soup.select_one("td[class='scrtext']").text
+        metadata = {"source": self.web_path}
+        return [Document(page_content=text, metadata=metadata)]
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@ -0,0 +1,29 @@
+"""Web base loader class."""
+from typing import List
+
+import requests
+from bs4 import BeautifulSoup
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class WebBaseLoader(BaseLoader):
+    """Loader that uses urllib and beautiful soup to load webpages."""
+
+    def __init__(self, web_path: str):
+        """Initialize with webpage path."""
+        self.web_path = web_path
+
+    def scrape(self) -> BeautifulSoup:
+        """Scrape data from webpage and return it in BeautifulSoup format."""
+        html_doc = requests.get(self.web_path)
+        soup = BeautifulSoup(html_doc.text, "html.parser")
+        return soup
+
+    def load(self) -> List[Document]:
+        """Load data into document objects."""
+        soup = self.scrape()
+        text = soup.get_text()
+        metadata = {"source": self.web_path}
+        return [Document(page_content=text, metadata=metadata)]