langchain/libs/community/langchain_community/document_loaders/hn.py

from typing import Any, List

from langchain_core.documents import Document

from langchain_community.document_loaders.web_base import WebBaseLoader


class HNLoader(WebBaseLoader):
    """Load `Hacker News` data.

    It loads data from either main page results or the comments page.
    """

    def load(self) -> List[Document]:
        """Get important HN webpage information.

        HN webpage components are:
            - title
            - content
            - source url
            - time of post
            - author of the post
            - number of comments
            - rank of the post
        """
        soup_info = self.scrape()
        # URLs containing "item" point at a single post, whose page holds the
        # comment thread; any other URL is treated as a listing page.
        if "item" in self.web_path:
            return self.load_comments(soup_info)
        else:
            return self.load_results(soup_info)

    def load_comments(self, soup_info: Any) -> List[Document]:
        """Load comments from a HN post."""
        # Each comment is rendered as a table row with class "athing comtr";
        # the post's title is carried in the "title" attribute of the
        # "pagespace" spacer row.
        comments = soup_info.select("tr[class='athing comtr']")
        title = soup_info.select_one("tr[id='pagespace']").get("title")
        return [
            Document(
                page_content=comment.text.strip(),
                metadata={"source": self.web_path, "title": title},
            )
            for comment in comments
        ]

    def load_results(self, soup: Any) -> List[Document]:
        """Load items from an HN page."""
        # Each ranked story is an "athing" row; its rank sits in span.rank and
        # its title and outbound link in span.titleline.
        items = soup.select("tr[class='athing']")
        documents = []
        for lineItem in items:
            ranking = lineItem.select_one("span[class='rank']").text
            link = lineItem.find("span", {"class": "titleline"}).find("a").get("href")
            title = lineItem.find("span", {"class": "titleline"}).text.strip()
            metadata = {
                "source": self.web_path,
                "title": title,
                "link": link,
                "ranking": ranking,
            }
            documents.append(
                Document(
                    page_content=title, link=link, ranking=ranking, metadata=metadata
                )
            )
        return documents
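

# ---------------------------------------------------------------------------
# Illustrative usage sketch (not part of the upstream module). It assumes the
# langchain-community package is installed and that network access to
# https://news.ycombinator.com is available; the item id below is a
# hypothetical placeholder chosen only to exercise the "item" code path.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Front page: load() dispatches to load_results() and yields one Document
    # per ranked story, with title, link and ranking in the metadata.
    front_page = HNLoader("https://news.ycombinator.com/")
    for doc in front_page.load()[:5]:
        print(doc.metadata["ranking"], doc.page_content, doc.metadata["link"])

    # Item page (URL contains "item"): load() dispatches to load_comments()
    # and yields one Document per comment, tagged with the post's title.
    post = HNLoader("https://news.ycombinator.com/item?id=1")  # hypothetical id
    comment_docs = post.load()
    if comment_docs:
        print(comment_docs[0].metadata["title"])
        print(comment_docs[0].page_content[:200])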