diff --git a/docs/modules/document_loaders/examples/example_data/fake-content.html b/docs/modules/document_loaders/examples/example_data/fake-content.html index 9ad19d30..acba7602 100644 --- a/docs/modules/document_loaders/examples/example_data/fake-content.html +++ b/docs/modules/document_loaders/examples/example_data/fake-content.html @@ -1,5 +1,8 @@ + + Test Title +

My First Heading

diff --git a/docs/modules/document_loaders/examples/html.ipynb b/docs/modules/document_loaders/examples/html.ipynb index 2a498828..91ff32e0 100644 --- a/docs/modules/document_loaders/examples/html.ipynb +++ b/docs/modules/document_loaders/examples/html.ipynb @@ -48,9 +48,7 @@ "outputs": [ { "data": { - "text/plain": [ - "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]" - ] + "text/plain": "[Document(page_content='My First Heading\\n\\nMy first paragraph.', lookup_str='', metadata={'source': 'example_data/fake-content.html'}, lookup_index=0)]" }, "execution_count": 4, "metadata": {}, @@ -61,13 +59,57 @@ "data" ] }, + { + "cell_type": "markdown", + "source": [ + "## Loading HTML with BeautifulSoup4\n", + "\n", + "We can also use BeautifulSoup4 to load HTML documents using the `BSHTMLLoader`. This extracts the text from the HTML into `page_content` and stores the page title as `title` in `metadata`." 
+ ], + "metadata": { + "collapsed": false + } + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "id": "79b1bce4", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "from langchain.document_loaders import BSHTMLLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "outputs": [ + { + "data": { + "text/plain": "[Document(page_content='\\n\\nTest Title\\n\\n\\nMy First Heading\\nMy first paragraph.\\n\\n\\n', lookup_str='', metadata={'source': 'example_data/fake-content.html', 'title': 'Test Title'}, lookup_index=0)]" + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader = BSHTMLLoader(\"example_data/fake-content.html\")\n", + "data = loader.load()\n", + "data" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } } ], "metadata": { diff --git a/langchain/document_loaders/html_bs.py b/langchain/document_loaders/html_bs.py index 92802ccb..568d8054 100644 --- a/langchain/document_loaders/html_bs.py +++ b/langchain/document_loaders/html_bs.py @@ -3,8 +3,6 @@ import logging from typing import Dict, List, Union -from bs4 import BeautifulSoup - from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader @@ -15,9 +13,18 @@ class BSHTMLLoader(BaseLoader): """Loader that uses beautiful soup to parse HTML files.""" def __init__(self, file_path: str) -> None: + try: + import bs4 # noqa:F401 + except ImportError: + raise ValueError( + "bs4 package not found, please install it with `pip install bs4`" + ) + self.file_path = file_path def load(self) -> List[Document]: """Load HTML document into document objects.""" + from bs4 import BeautifulSoup + with open(self.file_path, "r") as f: soup = BeautifulSoup(f, features="lxml")