Add HTML Title and Page Language into metadata for AsyncHtmlLoader (#11326)

**Description:**
Revise `libs/langchain/langchain/document_loaders/async_html.py` so that
`AsyncHtmlLoader` stores the HTML title and page language (plus the meta
description, when present) in each loaded document's `metadata`.
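
With this change, loaded documents carry the page title and language alongside the source URL. A minimal usage sketch; the URL and printed values are illustrative:

```python
from langchain.document_loaders import AsyncHtmlLoader

# Any HTML page with a <title> and an <html lang="..."> attribute works.
loader = AsyncHtmlLoader(["https://example.com"])
docs = loader.load()

# Before this change, metadata only held the source URL.
# Now it also carries title, description (if present), and language.
print(docs[0].metadata)
# e.g. {'source': 'https://example.com', 'title': 'Example Domain',
#       'language': 'No language found.'}
```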

```diff
@@ -24,6 +24,18 @@ default_header_template = {
 }
 
 
+def _build_metadata(soup: Any, url: str) -> dict:
+    """Build metadata from BeautifulSoup output."""
+    metadata = {"source": url}
+    if title := soup.find("title"):
+        metadata["title"] = title.get_text()
+    if description := soup.find("meta", attrs={"name": "description"}):
+        metadata["description"] = description.get("content", "No description found.")
+    if html := soup.find("html"):
+        metadata["language"] = html.get("lang", "No language found.")
+    return metadata
+
+
 class AsyncHtmlLoader(BaseLoader):
     """Load `HTML` asynchronously."""
```
```diff
@@ -33,6 +45,9 @@ class AsyncHtmlLoader(BaseLoader):
         header_template: Optional[dict] = None,
         verify_ssl: Optional[bool] = True,
         proxies: Optional[dict] = None,
+        autoset_encoding: bool = True,
+        encoding: Optional[str] = None,
+        default_parser: str = "html.parser",
         requests_per_second: int = 2,
         requests_kwargs: Optional[Dict[str, Any]] = None,
         raise_for_status: bool = False,
```
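
The three new constructor parameters control how the synchronous scrape decodes and parses the response. A sketch with non-default values; the URL is illustrative:

```python
loader = AsyncHtmlLoader(
    ["https://example.com"],
    encoding="utf-8",              # force this encoding on the response
    autoset_encoding=False,        # skipped anyway once `encoding` is set
    default_parser="html.parser",  # must be a valid bs4 parser name
)
```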
```diff
@@ -68,8 +83,46 @@ class AsyncHtmlLoader(BaseLoader):
             self.session.proxies.update(proxies)
 
         self.requests_per_second = requests_per_second
+        self.default_parser = default_parser
         self.requests_kwargs = requests_kwargs or {}
         self.raise_for_status = raise_for_status
+        self.autoset_encoding = autoset_encoding
+        self.encoding = encoding
+
+    @staticmethod
+    def _check_parser(parser: str) -> None:
+        """Check that parser is valid for bs4."""
+        valid_parsers = ["html.parser", "lxml", "xml", "lxml-xml", "html5lib"]
+        if parser not in valid_parsers:
+            raise ValueError(
+                "`parser` must be one of " + ", ".join(valid_parsers) + "."
+            )
+
+    def _scrape(
+        self,
+        url: str,
+        parser: Union[str, None] = None,
+        bs_kwargs: Optional[dict] = None,
+    ) -> Any:
+        from bs4 import BeautifulSoup
+
+        if parser is None:
+            if url.endswith(".xml"):
+                parser = "xml"
+            else:
+                parser = self.default_parser
+
+        self._check_parser(parser)
+
+        html_doc = self.session.get(url, **self.requests_kwargs)
+        if self.raise_for_status:
+            html_doc.raise_for_status()
+
+        if self.encoding is not None:
+            html_doc.encoding = self.encoding
+        elif self.autoset_encoding:
+            html_doc.encoding = html_doc.apparent_encoding
+
+        return BeautifulSoup(html_doc.text, parser, **(bs_kwargs or {}))
+
     async def _fetch(
         self, url: str, retries: int = 3, cooldown: int = 2, backoff: float = 1.5
```
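
Parser selection falls back to `default_parser` unless the URL ends in `.xml`, and invalid names fail fast via `_check_parser`. A small sketch of the validation, with a hypothetical URL and a deliberately bad parser name:

```python
loader = AsyncHtmlLoader(["https://example.com"])

loader._check_parser("html.parser")  # passes silently
try:
    loader._check_parser("regex")    # not a bs4 parser
except ValueError as e:
    print(e)
# `parser` must be one of html.parser, lxml, xml, lxml-xml, html5lib.
```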
```diff
@@ -142,7 +195,8 @@ class AsyncHtmlLoader(BaseLoader):
         results = asyncio.run(self.fetch_all(self.web_paths))
         docs = []
         for i, text in enumerate(cast(List[str], results)):
-            metadata = {"source": self.web_paths[i]}
+            soup = self._scrape(self.web_paths[i])
+            metadata = _build_metadata(soup, self.web_paths[i])
             docs.append(Document(page_content=text, metadata=metadata))
         return docs
```
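
Note that `page_content` still comes from the async fetch, while the metadata soup is produced by an extra synchronous `session.get` inside `_scrape`, so each URL is now requested twice during `load()`. A sketch of iterating the result, with hypothetical URLs:

```python
loader = AsyncHtmlLoader(["https://example.com", "https://example.org"])
for doc in loader.load():
    # One Document per input URL, in order; metadata built by _build_metadata.
    print(doc.metadata["source"], doc.metadata.get("title"))
```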
