From 5a084e1b208f6844acb4e5d9b7a4fabf5d5a1c51 Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Thu, 20 Jul 2023 22:30:59 -0700 Subject: [PATCH] Async HTML loader and HTML2Text transformer (#8036) New HTML loader that asynchronously loads a list of urls. New transformer using [HTML2Text](https://github.com/Alir3z4/html2text/) for converting HTML to clean, easy-to-read plain ASCII text (valid Markdown). --- .../integrations/async_html.ipynb | 107 ++++++++++++++ .../integrations/html2text.ipynb | 133 +++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/async_html.py | 138 ++++++++++++++++++ langchain/document_transformers/__init__.py | 2 + langchain/document_transformers/html2text.py | 41 ++++++ 6 files changed, 423 insertions(+) create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/async_html.ipynb create mode 100644 docs/extras/modules/data_connection/document_transformers/integrations/html2text.ipynb create mode 100644 langchain/document_loaders/async_html.py create mode 100644 langchain/document_transformers/html2text.py diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/async_html.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/async_html.ipynb new file mode 100644 index 0000000000..64cced79ad --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/async_html.ipynb @@ -0,0 +1,107 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e229e34c", + "metadata": {}, + "source": [ + "# AsyncHtmlLoader\n", + "\n", + "AsyncHtmlLoader loads raw HTML from a list of urls concurrently." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "4c8e4dab", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import AsyncHtmlLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "e76b5ddc", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Fetching pages: 100%|############| 2/2 [00:00<00:00, 9.96it/s]\n" + ] + } + ], + "source": [ + "urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n", + "loader = AsyncHtmlLoader(urls)\n", + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "5dca1c0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "' news. Stream exclusive games on ESPN+ and play fantasy sports.\" />\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n\\n str: + async with aiohttp.ClientSession() as session: + for i in range(retries): + try: + async with session.get( + url, + headers=self.session.headers, + ssl=None if self.session.verify else False, + ) as response: + return await response.text() + except aiohttp.ClientConnectionError as e: + if i == retries - 1: + raise + else: + logger.warning( + f"Error fetching {url} with attempt " + f"{i + 1}/{retries}: {e}. Retrying..." 
+ ) + await asyncio.sleep(cooldown * backoff**i) + raise ValueError("retry count exceeded") + + async def _fetch_with_rate_limit( + self, url: str, semaphore: asyncio.Semaphore + ) -> str: + async with semaphore: + return await self._fetch(url) + + async def fetch_all(self, urls: List[str]) -> Any: + """Fetch all urls concurrently with rate limiting.""" + semaphore = asyncio.Semaphore(self.requests_per_second) + tasks = [] + for url in urls: + task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore)) + tasks.append(task) + try: + from tqdm.asyncio import tqdm_asyncio + + return await tqdm_asyncio.gather( + *tasks, desc="Fetching pages", ascii=True, mininterval=1 + ) + except ImportError: + warnings.warn("For better logging of progress, `pip install tqdm`") + return await asyncio.gather(*tasks) + + def lazy_load(self) -> Iterator[Document]: + """Lazy load text from the url(s) in web_path.""" + for doc in self.load(): + yield doc + + def load(self) -> List[Document]: + """Load text from the url(s) in web_path.""" + + results = asyncio.run(self.fetch_all(self.web_paths)) + docs = [] + for i, text in enumerate(results): + metadata = {"source": self.web_paths[i]} + docs.append(Document(page_content=text, metadata=metadata)) + + return docs diff --git a/langchain/document_transformers/__init__.py b/langchain/document_transformers/__init__.py index 9f1489219f..43678d2e75 100644 --- a/langchain/document_transformers/__init__.py +++ b/langchain/document_transformers/__init__.py @@ -8,6 +8,7 @@ from langchain.document_transformers.embeddings_redundant_filter import ( EmbeddingsRedundantFilter, get_stateful_documents, ) +from langchain.document_transformers.html2text import Html2TextTransformer from langchain.document_transformers.long_context_reorder import LongContextReorder __all__ = [ @@ -19,6 +20,7 @@ __all__ = [ "get_stateful_documents", "LongContextReorder", "OpenAIMetadataTagger", + "Html2TextTransformer", ] from 
langchain.document_transformers.openai_functions import OpenAIMetadataTagger diff --git a/langchain/document_transformers/html2text.py b/langchain/document_transformers/html2text.py new file mode 100644 index 0000000000..d498f5505d --- /dev/null +++ b/langchain/document_transformers/html2text.py @@ -0,0 +1,41 @@ +from typing import Any, Sequence + +from langchain.schema import BaseDocumentTransformer, Document + + +class Html2TextTransformer(BaseDocumentTransformer): + """Convert HTML documents to clean, easy-to-read plain text (valid Markdown) using html2text + Example: + .. code-block:: python + from langchain.document_transformers import Html2TextTransformer + html2text=Html2TextTransformer() + docs_transform=html2text.transform_documents(docs) + """ + + def transform_documents( + self, + documents: Sequence[Document], + **kwargs: Any, + ) -> Sequence[Document]: + try: + import html2text + except ImportError: + raise ValueError( + """html2text package not found, please + install it with `pip install html2text`""" + ) + + # Create an html2text.HTML2Text object and override some properties + h = html2text.HTML2Text() + h.ignore_links = True + h.ignore_images = True + for d in documents: + d.page_content = h.handle(d.page_content) + return documents + + async def atransform_documents( + self, + documents: Sequence[Document], + **kwargs: Any, + ) -> Sequence[Document]: + raise NotImplementedError