From fea5619ce9f7f9170c7d3c126d4ac34742aeffe8 Mon Sep 17 00:00:00 2001
From: Tim Asp <707699+timothyasp@users.noreply.github.com>
Date: Sat, 15 Apr 2023 16:07:08 -0700
Subject: [PATCH] Add title, lang, description to Web loader document metadata
 (#2955)

Title, lang, and description are present on almost every web page, and they
are incredibly useful pieces of information that aren't currently captured by
the web base loader.

I thought about adding the title and description to the content of the
document, since that content could be useful in search, but I left it out for
now. If you think it'd be worth adding, I'm happy to add it.

I've found it's nice to have the title/description in the metadata, so there
is some structured data available when retrieving rows from vector DBs for
summaries and source citations. So if we do want to add them to the
`page_content`, I'd advocate for also including them in the metadata.
---
 langchain/document_loaders/web_base.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/langchain/document_loaders/web_base.py b/langchain/document_loaders/web_base.py
index f82e6a92..d51f458b 100644
--- a/langchain/document_loaders/web_base.py
+++ b/langchain/document_loaders/web_base.py
@@ -23,6 +23,18 @@ default_header_template = {
 }
 
 
+def _build_metadata(soup: Any, url: str) -> dict:
+    """Build metadata from BeautifulSoup output."""
+    metadata = {"source": url}
+    if title := soup.find("title"):
+        metadata["title"] = title.get_text()
+    if description := soup.find("meta", attrs={"name": "description"}):
+        metadata["description"] = description.get("content", None)
+    if html := soup.find("html"):
+        metadata["language"] = html.get("lang", None)
+    return metadata
+
+
 class WebBaseLoader(BaseLoader):
     """Loader that uses urllib and beautiful soup to load webpages."""
 
@@ -148,7 +160,7 @@ class WebBaseLoader(BaseLoader):
         for path in self.web_paths:
             soup = self._scrape(path)
             text = soup.get_text()
-            metadata = {"source": path}
+            metadata = _build_metadata(soup, path)
             docs.append(Document(page_content=text, metadata=metadata))
         return docs
 
@@ -159,8 +171,9 @@ class WebBaseLoader(BaseLoader):
         results = self.scrape_all(self.web_paths)
         docs = []
         for i in range(len(results)):
-            text = results[i].get_text()
-            metadata = {"source": self.web_paths[i]}
+            soup = results[i]
+            text = soup.get_text()
+            metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))
         return docs
 
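
For reference, a minimal usage sketch (not part of the patch) showing how the enriched metadata could be inspected once this change is applied. The URL is illustrative, and which of the new keys actually appear depends on the tags present on the fetched page:

```python
from langchain.document_loaders import WebBaseLoader

# Illustrative URL; any page that declares <title>, <meta name="description">,
# or a lang attribute on <html> will populate the corresponding metadata keys.
loader = WebBaseLoader("https://python.langchain.com/")
docs = loader.load()

doc = docs[0]
# Before this patch, metadata only contained {"source": <url>}.
# With the patch applied, it may also include "title", "description", and
# "language", depending on which tags the page actually provides.
print(doc.metadata)
```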