feat: adds UnstructuredURLLoader for loading data from urls (#979)

### Summary

Adds a `UnstructuredURLLoader` that supports loading data from a list of
URLs.


### Testing

```python
from langchain.document_loaders import UnstructuredURLLoader

urls = [
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023",
    "https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023"
]
loader = UnstructuredURLLoader(urls=urls)
raw_documents = loader.load()
```
This commit is contained in:
Matt Robinson 2023-02-10 13:18:38 -05:00 committed by GitHub
parent c64f98e2bb
commit 07a407d89a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 112 additions and 0 deletions

View File

@ -0,0 +1,78 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2dfc4698",
"metadata": {},
"source": [
"# URL\n",
"\n",
"This covers how to load HTML documents from a list of URLs into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "16c3699e",
"metadata": {},
"outputs": [],
"source": [
" from langchain.document_loaders import UnstructuredURLLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "836fbac1",
"metadata": {},
"outputs": [],
"source": [
"urls = [\n",
" \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023\",\n",
" \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023\"\n",
"]\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "00f46fda",
"metadata": {},
"outputs": [],
"source": [
"loader = UnstructuredURLLoader(urls=urls)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "b68a26b3",
"metadata": {},
"outputs": [],
"source": [
"data = loader.load()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.13"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -22,11 +22,13 @@ from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
from langchain.document_loaders.s3_file import S3FileLoader
from langchain.document_loaders.unstructured import UnstructuredFileLoader
from langchain.document_loaders.url import UnstructuredURLLoader
from langchain.document_loaders.web_base import WebBaseLoader
from langchain.document_loaders.youtube import YoutubeLoader
__all__ = [
"UnstructuredFileLoader",
"UnstructuredURLLoader",
"DirectoryLoader",
"NotionDirectoryLoader",
"ReadTheDocsLoader",

View File

@ -0,0 +1,32 @@
"""Loader that loads PDF files."""
from typing import List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class UnstructuredURLLoader(BaseLoader):
"""Loader that uses unstructured to load HTML files."""
def __init__(self, urls: List[str]):
"""Initialize with file path."""
try:
import unstructured # noqa:F401
except ImportError:
raise ValueError(
"unstructured package not found, please install it with "
"`pip install unstructured`"
)
self.urls = urls
def load(self) -> List[Document]:
"""Load file."""
from unstructured.partition.html import partition_html
docs: List[Document] = list()
for url in self.urls:
elements = partition_html(url=url)
text = "\n\n".join([str(el) for el in elements])
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
return docs