adding webpage loading logic (#942)

makefile-update-1
zanderchase 1 year ago committed by GitHub
parent c71027e725
commit 8e126bc9bd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,93 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "9c31caff",
"metadata": {},
"source": [
"# AZLyrics\n",
"This covers how to load AZLyrics webpages into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "7e6f5726",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import AZLyricsLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a0df4c24",
"metadata": {},
"outputs": [],
"source": [
"loader = AZLyricsLoader(\"https://www.azlyrics.com/lyrics/mileycyrus/flowers.html\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "8cd61b6e",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "162fd286",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content=\"Miley Cyrus - Flowers Lyrics | AZLyrics.com\\n\\r\\nWe were good, we were gold\\nKinda dream that can't be sold\\nWe were right till we weren't\\nBuilt a home and watched it burn\\n\\nI didn't wanna leave you\\nI didn't wanna lie\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\n\\nPaint my nails, cherry red\\nMatch the roses that you left\\nNo remorse, no regret\\nI forgive every word you said\\n\\nI didn't wanna leave you, baby\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours, yeah\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\\nI didn't wanna wanna leave you\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours (Yeah)\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than\\nYeah, I can love me better than you can, uh\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby (Than you can)\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\", lookup_str='', metadata={'source': 'https://www.azlyrics.com/lyrics/mileycyrus/flowers.html'}, lookup_index=0)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6358000c",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@ -37,6 +37,14 @@ There are a lot of different document loaders that LangChain supports. Below are
`GCS Directory <./examples/gcs_directory.html>`_: A walkthrough of how to load all files in a directory from Google Cloud Storage (GCS).
`Web Base <./examples/web_base.html>`_: A walkthrough of how to load all text data from webpages.
`IMSDb <./examples/imsdb.html>`_: A walkthrough of how to load all text data from IMSDb webpage.
`AZLyrics <./examples/azlyrics.html>`_: A walkthrough of how to load all text data from AZLyrics webpage.
`College Confidential <./examples/college_confidential.html>`_: A walkthrough of how to load all text data from College Confidential webpage.
`Gutenberg <./examples/gutenberg.html>`_: A walkthrough of how to load data from a Gutenberg ebook text.
.. toctree::

@ -1,5 +1,7 @@
"""All different types of document loaders."""
from langchain.document_loaders.azlyrics import AZLyricsLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.directory import DirectoryLoader
from langchain.document_loaders.docx import UnstructuredDocxLoader
from langchain.document_loaders.email import UnstructuredEmailLoader
@ -8,6 +10,7 @@ from langchain.document_loaders.gcs_file import GCSFileLoader
from langchain.document_loaders.googledrive import GoogleDriveLoader
from langchain.document_loaders.gutenberg import GutenbergLoader
from langchain.document_loaders.html import UnstructuredHTMLLoader
from langchain.document_loaders.imsdb import IMSDbLoader
from langchain.document_loaders.notion import NotionDirectoryLoader
from langchain.document_loaders.obsidian import ObsidianLoader
from langchain.document_loaders.pdf import UnstructuredPDFLoader
@ -17,6 +20,7 @@ from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader
__all__ = [
@ -37,5 +41,9 @@ __all__ = [
"S3DirectoryLoader",
"GCSFileLoader",
"GCSDirectoryLoader",
"WebBaseLoader",
"IMSDbLoader",
"AZLyricsLoader",
"CollegeConfidentialLoader",
"GutenbergLoader",
]

@ -0,0 +1,22 @@
"""Loader that loads AZLyrics."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader
class AZLyricsLoader(WebBaseLoader):
"""Loader that loads AZLyrics webpages."""
def __init__(self, web_path: str):
"""Initialize with webpage path."""
self.web_path = web_path
def load(self) -> List[Document]:
"""Load webpage."""
soup = self.scrape()
title = soup.title.text
lyrics = soup.find_all("div", {"class": ""})[2].text
text = title + lyrics
metadata = {"source": self.web_path}
return [Document(page_content=text, metadata=metadata)]

@ -0,0 +1,20 @@
"""Loader that loads College Confidential."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader
class CollegeConfidentialLoader(WebBaseLoader):
"""Loader that loads College Confidential webpages."""
def __init__(self, web_path: str):
"""Initialize with webpage path."""
self.web_path = web_path
def load(self) -> List[Document]:
"""Load webpage."""
soup = self.scrape()
text = soup.select_one("main[class='skin-handler']").text
metadata = {"source": self.web_path}
return [Document(page_content=text, metadata=metadata)]

@ -0,0 +1,20 @@
"""Loader that loads IMSDb."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.web_base import WebBaseLoader
class IMSDbLoader(WebBaseLoader):
"""Loader that loads IMSDb webpages."""
def __init__(self, web_path: str):
"""Initialize with webpage path."""
self.web_path = web_path
def load(self) -> List[Document]:
"""Load webpage."""
soup = self.scrape()
text = soup.select_one("td[class='scrtext']").text
metadata = {"source": self.web_path}
return [Document(page_content=text, metadata=metadata)]

@ -0,0 +1,29 @@
"""Web base loader class."""
from typing import List
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class WebBaseLoader(BaseLoader):
"""Loader that uses urllib and beautiful soup to load webpages."""
def __init__(self, web_path: str):
"""Initialize with webpage path."""
self.web_path = web_path
def scrape(self) -> BeautifulSoup:
"""Scrape data from webpage and return it in BeautifulSoup format."""
html_doc = requests.get(self.web_path)
soup = BeautifulSoup(html_doc.text, "html.parser")
return soup
def load(self) -> List[Document]:
"""Load data into document objects."""
soup = self.scrape()
text = soup.get_text()
metadata = {"source": self.web_path}
return [Document(page_content=text, metadata=metadata)]
Loading…
Cancel
Save