From 872605a5c5cddfef4b414186235c906ba8e8e13e Mon Sep 17 00:00:00 2001 From: Martin Holzhauer Date: Tue, 9 May 2023 19:18:33 +0200 Subject: [PATCH] Add an option to extract more metadata from crawled websites (#4347) This PR makes it possible to extract more metadata from websites for later use. My use case: parsing ld+json or microdata from sites and storing it as structured data in the metadata field --- langchain/document_loaders/sitemap.py | 11 ++++- .../document_loaders/test_sitemap.py | 44 +++++++++++++++++++ 2 files changed, 54 insertions(+), 1 deletion(-) diff --git a/langchain/document_loaders/sitemap.py b/langchain/document_loaders/sitemap.py index 2b184f38..7e3d3e41 100644 --- a/langchain/document_loaders/sitemap.py +++ b/langchain/document_loaders/sitemap.py @@ -11,6 +11,10 @@ def _default_parsing_function(content: Any) -> str: return str(content.get_text()) +def _default_meta_function(meta: dict, _content: Any) -> dict: + return {"source": meta["loc"], **meta} + + def _batch_block(iterable: Iterable, size: int) -> Generator[List[dict], None, None]: it = iter(iterable) while item := list(itertools.islice(it, size)): @@ -27,6 +31,7 @@ class SitemapLoader(WebBaseLoader): parsing_function: Optional[Callable] = None, blocksize: Optional[int] = None, blocknum: int = 0, + meta_function: Optional[Callable] = None, ): """Initialize with webpage path and optional filter URLs. 
@@ -37,6 +42,9 @@ class SitemapLoader(WebBaseLoader): parsing_function: Function to parse bs4.Soup output blocksize: number of sitemap locations per block blocknum: the number of the block that should be loaded - zero indexed + meta_function: Function to parse bs4.Soup output for metadata + remember when setting this method to also copy metadata["loc"] + to metadata["source"] if you are using this field """ if blocksize is not None and blocksize < 1: @@ -56,6 +64,7 @@ class SitemapLoader(WebBaseLoader): self.filter_urls = filter_urls self.parsing_function = parsing_function or _default_parsing_function + self.meta_function = meta_function or _default_meta_function self.blocksize = blocksize self.blocknum = blocknum @@ -110,7 +119,7 @@ class SitemapLoader(WebBaseLoader): return [ Document( page_content=self.parsing_function(results[i]), - metadata={**{"source": els[i]["loc"]}, **els[i]}, + metadata=self.meta_function(els[i], results[i]), ) for i in range(len(results)) ] diff --git a/tests/integration_tests/document_loaders/test_sitemap.py b/tests/integration_tests/document_loaders/test_sitemap.py index 3ac2a59e..b5cb98f3 100644 --- a/tests/integration_tests/document_loaders/test_sitemap.py +++ b/tests/integration_tests/document_loaders/test_sitemap.py @@ -1,3 +1,5 @@ +from typing import Any + import pytest from langchain.document_loaders import SitemapLoader @@ -78,3 +80,45 @@ def test_filter_sitemap() -> None: documents = loader.load() assert len(documents) == 1 assert "🦜🔗" in documents[0].page_content + + +def test_sitemap_metadata() -> None: + def sitemap_metadata_one(meta: dict, _content: None) -> dict: + return {**meta, "mykey": "Super Important Metadata"} + + """Test sitemap loader.""" + loader = SitemapLoader( + "https://langchain.readthedocs.io/sitemap.xml", + meta_function=sitemap_metadata_one, + ) + documents = loader.load() + assert len(documents) > 1 + assert "mykey" in documents[0].metadata + assert "Super Important Metadata" in 
documents[0].metadata["mykey"] + + +def test_sitemap_metadata_extraction() -> None: + def sitemap_metadata_two(meta: dict, content: Any) -> dict: + title = content.find("title") + if title: + return {**meta, "title": title.get_text()} + return meta + + """Test sitemap loader.""" + loader = SitemapLoader( + "https://langchain.readthedocs.io/sitemap.xml", + meta_function=sitemap_metadata_two, + ) + documents = loader.load() + assert len(documents) > 1 + assert "title" in documents[0].metadata + assert "LangChain" in documents[0].metadata["title"] + + +def test_sitemap_metadata_default() -> None: + """Test sitemap loader.""" + loader = SitemapLoader("https://langchain.readthedocs.io/sitemap.xml") + documents = loader.load() + assert len(documents) > 1 + assert "source" in documents[0].metadata + assert "loc" in documents[0].metadata