Add title, lang, description to Web loader document metadata (#2955)

Title, lang and description are on almost every web page, and are
incredibly useful pieces of information that aren't currently captured
by the web base loader.

I thought about adding the title and description to the content of the
document, as
that content could be useful in search, but I left it out for right now.
If you think
it'd be worth adding, happy to add it.


I've found it's nice to have the title/description in the metadata to
have some structured data
when retrieving rows from vectordbs for use with summary and source
citation. So if we do want to add it to the `page_content`, I'd advocate
for it to also be included in the metadata.
fix_agent_callbacks
Tim Asp 1 year ago committed by GitHub
parent f7bf917baf
commit fea5619ce9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -23,6 +23,18 @@ default_header_template = {
}
def _build_metadata(soup: Any, url: str) -> dict:
    """Build document metadata from BeautifulSoup output.

    Args:
        soup: Parsed page. Only ``find`` is called on it, plus ``get_text``
            and ``get`` on the tags that ``find`` returns.
        url: Address the page was loaded from; stored under ``"source"``.

    Returns:
        A dict that always contains ``"source"`` and, when the page supplies
        them, ``"title"``, ``"description"`` and ``"language"``. A key is
        omitted (never set to ``None``) when its value is unavailable.
    """
    metadata = {"source": url}
    if title := soup.find("title"):
        metadata["title"] = title.get_text()
    if description := soup.find("meta", attrs={"name": "description"}):
        # A description <meta> tag without a ``content`` attribute carries no
        # description, so skip the key instead of storing None.
        if (content := description.get("content")) is not None:
            metadata["description"] = content
    if html := soup.find("html"):
        # Same rule for <html> without a ``lang`` attribute.
        if (lang := html.get("lang")) is not None:
            metadata["language"] = lang
    return metadata
class WebBaseLoader(BaseLoader):
"""Loader that uses urllib and beautiful soup to load webpages."""
@ -148,7 +160,7 @@ class WebBaseLoader(BaseLoader):
for path in self.web_paths:
soup = self._scrape(path)
text = soup.get_text()
metadata = {"source": path}
metadata = _build_metadata(soup, path)
docs.append(Document(page_content=text, metadata=metadata))
return docs
@ -159,8 +171,9 @@ class WebBaseLoader(BaseLoader):
results = self.scrape_all(self.web_paths)
docs = []
for i in range(len(results)):
text = results[i].get_text()
metadata = {"source": self.web_paths[i]}
soup = results[i]
text = soup.get_text()
metadata = _build_metadata(soup, self.web_paths[i])
docs.append(Document(page_content=text, metadata=metadata))
return docs

Loading…
Cancel
Save