forked from Archives/langchain
adding webpage loading logic (#942)
parent
c71027e725
commit
8e126bc9bd
@ -0,0 +1,93 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "9c31caff",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# AZLyrics\n",
|
||||
"This covers how to load AZLyrics webpages into a document format that we can use downstream."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "7e6f5726",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import AZLyricsLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "a0df4c24",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = AZLyricsLoader(\"https://www.azlyrics.com/lyrics/mileycyrus/flowers.html\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "8cd61b6e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"data = loader.load()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "162fd286",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content=\"Miley Cyrus - Flowers Lyrics | AZLyrics.com\\n\\r\\nWe were good, we were gold\\nKinda dream that can't be sold\\nWe were right till we weren't\\nBuilt a home and watched it burn\\n\\nI didn't wanna leave you\\nI didn't wanna lie\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\n\\nPaint my nails, cherry red\\nMatch the roses that you left\\nNo remorse, no regret\\nI forgive every word you said\\n\\nI didn't wanna leave you, baby\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours, yeah\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than you can\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\\nI didn't wanna wanna leave you\\nI didn't wanna fight\\nStarted to cry but then remembered I\\n\\nI can buy myself flowers\\nWrite my name in the sand\\nTalk to myself for hours (Yeah)\\nSay things you don't understand\\nI can take myself dancing\\nAnd I can hold my own hand\\nYeah, I can love me better than\\nYeah, I can love me better than you can, uh\\n\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI can love me better, baby (Than you can)\\nCan love me better\\nI can love me better, baby\\nCan love me better\\nI\\n\", lookup_str='', metadata={'source': 'https://www.azlyrics.com/lyrics/mileycyrus/flowers.html'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "6358000c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.8.1"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -0,0 +1,22 @@
|
||||
"""Loader that loads AZLyrics."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class AZLyricsLoader(WebBaseLoader):
    """Loader that loads AZLyrics webpages.

    The produced document's text is the page title immediately followed by
    the text of the lyrics container; the page URL is stored under the
    ``source`` metadata key.
    """

    # Position of the lyrics container within the list of ``<div>`` elements
    # returned by the ``{"class": ""}`` filter used in ``load``.
    # NOTE(review): presumably tied to the current AZLyrics page layout --
    # confirm if the site markup changes.
    _LYRICS_DIV_INDEX = 2

    def __init__(self, web_path: str):
        """Initialize with webpage path.

        Args:
            web_path: URL of the AZLyrics page to load.
        """
        # Delegate to the base class so that any state it sets up is not
        # silently skipped by this override.
        super().__init__(web_path)

    def load(self) -> List[Document]:
        """Load webpage.

        Returns:
            A single-element list containing one ``Document`` whose
            ``page_content`` is the page title concatenated with the lyrics
            text, and whose metadata maps ``"source"`` to ``web_path``.

        Raises:
            ValueError: If the page has no ``<title>`` element, or has too
                few matching ``<div>`` elements to contain the lyrics.
        """
        soup = self.scrape()

        title_tag = soup.title
        if title_tag is None:
            raise ValueError(f"No <title> element found at {self.web_path}")

        candidates = soup.find_all("div", {"class": ""})
        # Fail with an actionable message instead of a bare IndexError when
        # the page does not have the expected structure.
        if len(candidates) <= self._LYRICS_DIV_INDEX:
            raise ValueError(
                f"Expected at least {self._LYRICS_DIV_INDEX + 1} matching "
                f"<div> elements at {self.web_path}, found {len(candidates)}"
            )

        text = title_tag.text + candidates[self._LYRICS_DIV_INDEX].text
        metadata = {"source": self.web_path}
        return [Document(page_content=text, metadata=metadata)]
|
@ -0,0 +1,20 @@
|
||||
"""Loader that loads College Confidential."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class CollegeConfidentialLoader(WebBaseLoader):
    """Loader that loads College Confidential webpages.

    The produced document's text is the text of the page's
    ``main[class='skin-handler']`` element; the page URL is stored under the
    ``source`` metadata key.
    """

    def __init__(self, web_path: str):
        """Initialize with webpage path.

        Args:
            web_path: URL of the College Confidential page to load.
        """
        # Delegate to the base class so that any state it sets up is not
        # silently skipped by this override.
        super().__init__(web_path)

    def load(self) -> List[Document]:
        """Load webpage.

        Returns:
            A single-element list containing one ``Document`` whose
            ``page_content`` is the text of the selected ``<main>`` element
            and whose metadata maps ``"source"`` to ``web_path``.

        Raises:
            ValueError: If the page has no element matching the selector.
        """
        soup = self.scrape()

        main_tag = soup.select_one("main[class='skin-handler']")
        # ``select_one`` returns None when nothing matches; report which page
        # and selector failed rather than an AttributeError on ``.text``.
        if main_tag is None:
            raise ValueError(
                "No element matching main[class='skin-handler'] found at "
                f"{self.web_path}"
            )

        metadata = {"source": self.web_path}
        return [Document(page_content=main_tag.text, metadata=metadata)]
|
@ -0,0 +1,20 @@
|
||||
"""Loader that loads IMSDb."""
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.web_base import WebBaseLoader
|
||||
|
||||
|
||||
class IMSDbLoader(WebBaseLoader):
    """Loader that loads IMSDb webpages.

    The produced document's text is the text of the page's
    ``td[class='scrtext']`` element; the page URL is stored under the
    ``source`` metadata key.
    """

    def __init__(self, web_path: str):
        """Initialize with webpage path.

        Args:
            web_path: URL of the IMSDb page to load.
        """
        # Delegate to the base class so that any state it sets up is not
        # silently skipped by this override.
        super().__init__(web_path)

    def load(self) -> List[Document]:
        """Load webpage.

        Returns:
            A single-element list containing one ``Document`` whose
            ``page_content`` is the text of the selected ``<td>`` element
            and whose metadata maps ``"source"`` to ``web_path``.

        Raises:
            ValueError: If the page has no element matching the selector.
        """
        soup = self.scrape()

        script_cell = soup.select_one("td[class='scrtext']")
        # ``select_one`` returns None when nothing matches; report which page
        # and selector failed rather than an AttributeError on ``.text``.
        if script_cell is None:
            raise ValueError(
                f"No element matching td[class='scrtext'] found at {self.web_path}"
            )

        metadata = {"source": self.web_path}
        return [Document(page_content=script_cell.text, metadata=metadata)]
|
@ -0,0 +1,29 @@
|
||||
"""Web base loader class."""
|
||||
from typing import List, Optional

import requests
from bs4 import BeautifulSoup

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
class WebBaseLoader(BaseLoader):
    """Loader that uses requests and BeautifulSoup to load webpages."""

    # Class-level fallback so that subclasses which override ``__init__`` and
    # only assign ``web_path`` still resolve ``self.timeout`` in ``scrape``.
    timeout: Optional[float] = None

    def __init__(self, web_path: str, timeout: Optional[float] = None):
        """Initialize with webpage path.

        Args:
            web_path: URL of the page to fetch.
            timeout: Seconds to pass to ``requests.get`` as its ``timeout``
                argument. ``None`` (the default) is the same value requests
                uses when the argument is omitted, i.e. no timeout.
        """
        self.web_path = web_path
        self.timeout = timeout

    def scrape(self) -> BeautifulSoup:
        """Scrape data from webpage and return it in BeautifulSoup format.

        Issues an HTTP GET for ``web_path`` and parses the response text with
        the ``"html.parser"`` backend.

        Returns:
            The parsed page.
        """
        # NOTE(review): the HTTP status code is not checked, so the body of an
        # error response is parsed like any other page.
        response = requests.get(self.web_path, timeout=self.timeout)
        return BeautifulSoup(response.text, "html.parser")

    def load(self) -> List[Document]:
        """Load data into document objects.

        Returns:
            A single-element list containing one ``Document`` whose
            ``page_content`` is the text extracted from the whole page and
            whose metadata maps ``"source"`` to ``web_path``.
        """
        soup = self.scrape()
        metadata = {"source": self.web_path}
        return [Document(page_content=soup.get_text(), metadata=metadata)]
|
Loading…
Reference in New Issue