From c08b622b2d33c99b05ce357f79fb3f9c9694021f Mon Sep 17 00:00:00 2001 From: SteveLiao <32828303+kkk55596@users.noreply.github.com> Date: Fri, 27 Oct 2023 11:22:31 +0800 Subject: [PATCH] Add HTML Title and Page Language into metadata for AsyncHtmlLoader (#11326) **Description:** Revise `libs/langchain/langchain/document_loaders/async_html.py` to store the HTML Title and Page Language in the `metadata` of `AsyncHtmlLoader`. --- .../langchain/document_loaders/async_html.py | 56 ++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/libs/langchain/langchain/document_loaders/async_html.py b/libs/langchain/langchain/document_loaders/async_html.py index 80436bfec6..877a9c41a9 100644 --- a/libs/langchain/langchain/document_loaders/async_html.py +++ b/libs/langchain/langchain/document_loaders/async_html.py @@ -24,6 +24,18 @@ default_header_template = { } +def _build_metadata(soup: Any, url: str) -> dict: + """Build metadata from BeautifulSoup output.""" + metadata = {"source": url} + if title := soup.find("title"): + metadata["title"] = title.get_text() + if description := soup.find("meta", attrs={"name": "description"}): + metadata["description"] = description.get("content", "No description found.") + if html := soup.find("html"): + metadata["language"] = html.get("lang", "No language found.") + return metadata + + class AsyncHtmlLoader(BaseLoader): """Load `HTML` asynchronously.""" @@ -33,6 +45,9 @@ class AsyncHtmlLoader(BaseLoader): header_template: Optional[dict] = None, verify_ssl: Optional[bool] = True, proxies: Optional[dict] = None, + autoset_encoding: bool = True, + encoding: Optional[str] = None, + default_parser: str = "html.parser", requests_per_second: int = 2, requests_kwargs: Optional[Dict[str, Any]] = None, raise_for_status: bool = False, @@ -68,8 +83,46 @@ class AsyncHtmlLoader(BaseLoader): self.session.proxies.update(proxies) self.requests_per_second = requests_per_second + self.default_parser = default_parser self.requests_kwargs = requests_kwargs or {} self.raise_for_status = raise_for_status + self.autoset_encoding = autoset_encoding + self.encoding = encoding + + @staticmethod + def _check_parser(parser: str) -> None: + """Check that parser is valid for bs4.""" + valid_parsers = ["html.parser", "lxml", "xml", "lxml-xml", "html5lib"] + if parser not in valid_parsers: + raise ValueError( + "`parser` must be one of " + ", ".join(valid_parsers) + "." + ) + + def _scrape( + self, + url: str, + parser: Union[str, None] = None, + bs_kwargs: Optional[dict] = None, + ) -> Any: + from bs4 import BeautifulSoup + + if parser is None: + if url.endswith(".xml"): + parser = "xml" + else: + parser = self.default_parser + + self._check_parser(parser) + + html_doc = self.session.get(url, **self.requests_kwargs) + if self.raise_for_status: + html_doc.raise_for_status() + + if self.encoding is not None: + html_doc.encoding = self.encoding + elif self.autoset_encoding: + html_doc.encoding = html_doc.apparent_encoding + return BeautifulSoup(html_doc.text, parser, **(bs_kwargs or {})) async def _fetch( self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5 @@ -142,7 +195,8 @@ class AsyncHtmlLoader(BaseLoader): results = asyncio.run(self.fetch_all(self.web_paths)) docs = [] for i, text in enumerate(cast(List[str], results)): - metadata = {"source": self.web_paths[i]} + soup = self._scrape(self.web_paths[i]) + metadata = _build_metadata(soup, self.web_paths[i]) docs.append(Document(page_content=text, metadata=metadata)) return docs