diff --git a/docs/modules/document_loaders/examples/url.ipynb b/docs/modules/document_loaders/examples/url.ipynb new file mode 100644 index 0000000000..c24c911960 --- /dev/null +++ b/docs/modules/document_loaders/examples/url.ipynb @@ -0,0 +1,78 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "2dfc4698", + "metadata": {}, + "source": [ + "# URL\n", + "\n", + "This covers how to load HTML documents from a list of URLs into a document format that we can use downstream." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "16c3699e", + "metadata": {}, + "outputs": [], + "source": [ + " from langchain.document_loaders import UnstructuredURLLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "836fbac1", + "metadata": {}, + "outputs": [], + "source": [ + "urls = [\n", + " \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-8-2023\",\n", + " \"https://www.understandingwar.org/backgrounder/russian-offensive-campaign-assessment-february-9-2023\"\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "00f46fda", + "metadata": {}, + "outputs": [], + "source": [ + "loader = UnstructuredURLLoader(urls=urls)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "b68a26b3", + "metadata": {}, + "outputs": [], + "source": [ + "data = loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index df7cf4ea40..714cd99628 100644 --- a/langchain/document_loaders/__init__.py 
"""Loader that loads HTML documents from URLs."""
import logging
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader

logger = logging.getLogger(__name__)


class UnstructuredURLLoader(BaseLoader):
    """Loader that uses unstructured to load HTML documents from URLs.

    Each URL is handed to ``unstructured.partition.html.partition_html`` and
    the resulting elements are joined into a single ``Document`` per URL.
    """

    def __init__(self, urls: List[str], *, continue_on_failure: bool = False):
        """Initialize with the URLs to load.

        Args:
            urls: URLs of the HTML pages to load.
            continue_on_failure: If False (the default), an error raised
                while loading any URL propagates out of ``load``. If True,
                the error is logged and that URL is skipped, so the
                remaining URLs are still loaded.

        Raises:
            ValueError: If the ``unstructured`` package is not installed.
        """
        try:
            import unstructured  # noqa:F401
        except ImportError as err:
            # Keep raising ValueError so existing callers that catch it keep
            # working; chain the original error so the root cause is visible.
            raise ValueError(
                "unstructured package not found, please install it with "
                "`pip install unstructured`"
            ) from err
        self.urls = urls
        self.continue_on_failure = continue_on_failure

    def load(self) -> List[Document]:
        """Load every URL and return one ``Document`` per URL.

        Returns:
            Documents in the same order as ``self.urls``. Each document's
            ``page_content`` is the string form of the partitioned elements
            joined by blank lines, and its metadata records the URL under
            the ``"source"`` key. URLs that failed are omitted when
            ``continue_on_failure`` is True.
        """
        # Imported lazily so that importing this module does not require the
        # optional ``unstructured`` dependency.
        from unstructured.partition.html import partition_html

        docs: List[Document] = []
        for url in self.urls:
            try:
                elements = partition_html(url=url)
            except Exception:
                if not self.continue_on_failure:
                    raise
                # Best-effort mode: record the failure with its traceback and
                # move on instead of discarding documents already loaded.
                logger.exception("Error fetching or processing %s", url)
                continue
            text = "\n\n".join(str(el) for el in elements)
            docs.append(Document(page_content=text, metadata={"source": url}))
        return docs