Harrison/site map (#2061)

Co-authored-by: Tim Asp <707699+timothyasp@users.noreply.github.com>
Harrison Chase 1 year ago committed by GitHub
parent 8b5a43d720
commit a0cd6672aa

@@ -85,7 +85,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
-"version": "3.8.1"
+"version": "3.9.1"
 }
 },
 "nbformat": 4,

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

@@ -16,7 +16,7 @@ See [this notebook](../modules/indexes/getting_started.ipynb) for a more detaile
 from langchain.document_loaders import TextLoader
 loader = TextLoader('../state_of_the_union.txt')
 ```
-See [here](../modules/document_loaders/how_to_guides.rst) for more information on how to get started with document loading.
+See [here](../modules/indexes/document_loaders.rst) for more information on how to get started with document loading.

 **Create Your Index**
 ```python

@@ -43,6 +43,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader
 from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
+from langchain.document_loaders.sitemap import SitemapLoader
 from langchain.document_loaders.srt import SRTLoader
 from langchain.document_loaders.telegram import TelegramChatLoader
 from langchain.document_loaders.text import TextLoader
@@ -112,4 +113,5 @@ __all__ = [
     "BlackboardLoader",
     "AzureBlobStorageFileLoader",
     "AzureBlobStorageContainerLoader",
+    "SitemapLoader",
 ]
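
For clarity, a small sketch of what this export wiring buys: the class becomes importable from the package root as well as from its own module. The alias `_SitemapLoader` exists only for the comparison and is not part of the change.

```python
# Both import paths now resolve to the same class.
from langchain.document_loaders import SitemapLoader
from langchain.document_loaders.sitemap import SitemapLoader as _SitemapLoader

assert SitemapLoader is _SitemapLoader
```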

@@ -0,0 +1,69 @@
"""Loader that fetches a sitemap and loads those URLs."""
import re
from typing import Any, List, Optional

from langchain.document_loaders.web_base import WebBaseLoader
from langchain.schema import Document


class SitemapLoader(WebBaseLoader):
    """Loader that fetches a sitemap and loads those URLs."""

    def __init__(self, web_path: str, filter_urls: Optional[List[str]] = None):
        """Initialize with webpage path and optional filter URLs.

        Args:
            web_path: url of the sitemap
            filter_urls: list of strings or regexes that will be applied to filter the
                urls that are parsed and loaded
        """
        try:
            import lxml  # noqa:F401
        except ImportError:
            raise ValueError(
                "lxml package not found, please install it with " "`pip install lxml`"
            )

        super().__init__(web_path)

        self.filter_urls = filter_urls

    def parse_sitemap(self, soup: Any) -> List[dict]:
        """Parse sitemap xml and load into a list of dicts."""
        els = []

        for url in soup.find_all("url"):
            loc = url.find("loc")
            if not loc:
                continue

            if self.filter_urls and not any(
                re.match(r, loc.text) for r in self.filter_urls
            ):
                continue

            els.append(
                {
                    tag: prop.text
                    for tag in ["loc", "lastmod", "changefreq", "priority"]
                    if (prop := url.find(tag))
                }
            )

        return els

    def load(self) -> List[Document]:
        """Load sitemap."""
        soup = self.scrape("xml")

        els = self.parse_sitemap(soup)

        results = self.scrape_all([el["loc"] for el in els if "loc" in el])

        return [
            Document(
                page_content=str(results[i].get_text()),
                metadata={**{"source": els[i]["loc"]}, **els[i]},
            )
            for i in range(len(results))
        ]
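
A hedged usage sketch of the loader above, relying only on the constructor, `load()`, and the metadata keys visible in this file; the sitemap URL and the filter pattern are placeholders, not part of this change.

```python
from langchain.document_loaders.sitemap import SitemapLoader

# Placeholder sitemap and filter pattern; filter_urls entries are treated as
# regexes matched against each <loc> with re.match.
loader = SitemapLoader(
    "https://example.com/sitemap.xml",
    filter_urls=["https://example\\.com/docs/.*"],
)

docs = loader.load()
for doc in docs:
    # "source" always holds the <loc>; lastmod/changefreq/priority are only
    # present when the sitemap provides them.
    print(doc.metadata["source"], doc.metadata.get("lastmod"))
```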

@@ -1,7 +1,9 @@
 """Web base loader class."""
+import asyncio
 import logging
-from typing import Any, List, Optional
+from typing import Any, List, Optional, Union

+import aiohttp
 import requests

 from langchain.docstore.document import Document
@@ -24,10 +26,34 @@ default_header_template = {
 class WebBaseLoader(BaseLoader):
     """Loader that uses urllib and beautiful soup to load webpages."""

-    def __init__(self, web_path: str, header_template: Optional[dict] = None):
+    web_paths: List[str]
+
+    requests_per_second: int = 2
+    """Max number of concurrent requests to make."""
+
+    default_parser: str = "html.parser"
+    """Default parser to use for BeautifulSoup."""
+
+    def __init__(
+        self, web_path: Union[str, List[str]], header_template: Optional[dict] = None
+    ):
         """Initialize with webpage path."""
-        self.web_path = web_path
+
+        # TODO: Deprecate web_path in favor of web_paths, and remove this
+        # left like this because there are a number of loaders that expect single
+        # urls
+        if isinstance(web_path, str):
+            self.web_paths = [web_path]
+        elif isinstance(web_path, List):
+            self.web_paths = web_path
+
         self.session = requests.Session()

+        try:
+            import bs4  # noqa:F401
+        except ImportError:
+            raise ValueError(
+                "bs4 package not found, please install it with " "`pip install bs4`"
+            )
+
         try:
             from fake_useragent import UserAgent
@@ -41,20 +67,91 @@ class WebBaseLoader(BaseLoader):
                 "To get a realistic header for requests, `pip install fake_useragent`."
             )

-    def _scrape(self, url: str) -> Any:
-        from bs4 import BeautifulSoup
-
-        html_doc = self.session.get(url)
-        soup = BeautifulSoup(html_doc.text, "html.parser")
-        return soup
-
-    def scrape(self) -> Any:
-        """Scrape data from webpage and return it in BeautifulSoup format."""
-        return self._scrape(self.web_path)
-
-    def load(self) -> List[Document]:
-        """Load data into document objects."""
-        soup = self.scrape()
-        text = soup.get_text()
-        metadata = {"source": self.web_path}
-        return [Document(page_content=text, metadata=metadata)]
+    @property
+    def web_path(self) -> str:
+        if len(self.web_paths) > 1:
+            raise ValueError("Multiple webpaths found.")
+        return self.web_paths[0]
+
+    async def _fetch(self, url: str) -> str:
+        async with aiohttp.ClientSession() as session:
+            async with session.get(url, headers=self.session.headers) as response:
+                return await response.text()
+
+    async def _fetch_with_rate_limit(
+        self, url: str, semaphore: asyncio.Semaphore
+    ) -> str:
+        async with semaphore:
+            return await self._fetch(url)
+
+    async def fetch_all(self, urls: List[str]) -> Any:
+        """Fetch all urls concurrently with rate limiting."""
+        semaphore = asyncio.Semaphore(self.requests_per_second)
+        tasks = []
+        for url in urls:
+            task = asyncio.ensure_future(self._fetch_with_rate_limit(url, semaphore))
+            tasks.append(task)
+        return await asyncio.gather(*tasks)
+
+    @staticmethod
+    def _check_parser(parser: str) -> None:
+        """Check that parser is valid for bs4."""
+        valid_parsers = ["html.parser", "lxml", "xml", "lxml-xml", "html5lib"]
+        if parser not in valid_parsers:
+            raise ValueError(
+                "`parser` must be one of " + ", ".join(valid_parsers) + "."
+            )
+
+    def scrape_all(self, urls: List[str], parser: Union[str, None] = None) -> List[Any]:
+        """Fetch all urls, then return soups for all results."""
+        from bs4 import BeautifulSoup
+
+        if parser is None:
+            parser = self.default_parser
+
+        self._check_parser(parser)
+
+        results = asyncio.run(self.fetch_all(urls))
+        return [BeautifulSoup(result, parser) for result in results]
+
+    def _scrape(self, url: str, parser: Union[str, None] = None) -> Any:
+        from bs4 import BeautifulSoup
+
+        if parser is None:
+            parser = self.default_parser
+
+        self._check_parser(parser)
+
+        html_doc = self.session.get(url)
+        return BeautifulSoup(html_doc.text, parser)
+
+    def scrape(self, parser: Union[str, None] = None) -> Any:
+        """Scrape data from webpage and return it in BeautifulSoup format."""
+        if parser is None:
+            parser = self.default_parser
+
+        return self._scrape(self.web_path, parser)
+
+    def load(self) -> List[Document]:
+        """Load text from the url(s) in web_path."""
+        docs = []
+        for path in self.web_paths:
+            soup = self._scrape(path)
+            text = soup.get_text()
+            metadata = {"source": path}
+            docs.append(Document(page_content=text, metadata=metadata))
+        return docs
+
+    def aload(self) -> List[Document]:
+        """Load text from the urls in web_path async into Documents."""
+        results = self.scrape_all(self.web_paths)
+        docs = []
+        for i in range(len(results)):
+            text = results[i].get_text()
+            metadata = {"source": self.web_paths[i]}
+            docs.append(Document(page_content=text, metadata=metadata))
+        return docs
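
For reference, a sketch of the reworked `WebBaseLoader` surface under the assumptions in this diff: `web_path` may be a single URL or a list, `requests_per_second` throttles the async path, and `aload()` fetches concurrently while `load()` stays sequential. The URLs below are placeholders.

```python
from langchain.document_loaders.web_base import WebBaseLoader

# A list of placeholder URLs; a plain string is still accepted for backwards
# compatibility and is wrapped into web_paths internally.
loader = WebBaseLoader(["https://example.com/a", "https://example.com/b"])
loader.requests_per_second = 1  # throttle the aiohttp fetches

docs = loader.load()          # sequential, via requests.Session
docs_async = loader.aload()   # concurrent, via aiohttp + asyncio.run
```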

@@ -0,0 +1,20 @@
from langchain.document_loaders import SitemapLoader


def test_sitemap() -> None:
    """Test sitemap loader."""
    loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml")
    documents = loader.load()
    assert len(documents) > 1
    assert "🦜🔗" in documents[0].page_content


def test_filter_sitemap() -> None:
    """Test sitemap loader."""
    loader = SitemapLoader(
        "https://langchain.readthedocs.io/sitemap.xml",
        filter_urls=["https://langchain.readthedocs.io/en/stable/"],
    )
    documents = loader.load()
    assert len(documents) == 1
    assert "🦜🔗" in documents[0].page_content