Newspaper (#8647)

- Description: Added newspaper3k based news article loader. Provide a list of urls. - Issue: N/A - Dependencies: newspaper3k, - Tag maintainer: @rlancemartin , @eyurtsev - Twitter handle: @ruze --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
1 year ago · 71f98db2fe
parent f68f3b23d7
commit 71f98db2fe
4 changed files with 386 additions and 0 deletions
--- a/docs/extras/integrations/document_loaders/news.ipynb
+++ b/docs/extras/integrations/document_loaders/news.ipynb
@ -0,0 +1,192 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2dfc4698",
+   "metadata": {},
+   "source": [
+    "# News URL\n",
+    "\n",
+    "This covers how to load HTML news articles from a list of URLs into a document format that we can use downstream."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "16c3699e",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-08-02T21:18:18.886031400Z",
+     "start_time": "2023-08-02T21:18:17.682345Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import NewsURLLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "836fbac1",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-08-02T21:18:18.895539800Z",
+     "start_time": "2023-08-02T21:18:18.895539800Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "urls = [\n",
+    "    \"https://www.bbc.com/news/world-us-canada-66388172\",\n",
+    "    \"https://www.bbc.com/news/entertainment-arts-66384971\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33089aba-ff74-4d00-8f40-9449c29587cc",
+   "metadata": {},
+   "source": [
+    "Pass in a list of URLs to load each one into a Document."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "00f46fda",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-08-02T21:18:19.227074500Z",
+     "start_time": "2023-08-02T21:18:18.895539800Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First article:  page_content='In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact. Neither she nor her representatives have commented.' metadata={'title': 'Donald Trump indictment: What do we know about the six co-conspirators?', 'link': 'https://www.bbc.com/news/world-us-canada-66388172', 'authors': [], 'language': 'en', 'description': 'Six people accused of helping Mr Trump undermine the election have been described by prosecutors.', 'publish_date': None}\n",
+      "\n",
+      "Second article:  page_content='Ms Williams added: \"If there\\'s anything that I can do in my power to ensure that dancers or singers or whoever decides to work with her don\\'t have to go through that same experience, I\\'m going to do that.\"' metadata={'title': \"Lizzo dancers Arianna Davis and Crystal Williams: 'No one speaks out, they are scared'\", 'link': 'https://www.bbc.com/news/entertainment-arts-66384971', 'authors': [], 'language': 'en', 'description': 'The US pop star is being sued for sexual harassment and fat-shaming but has yet to comment.', 'publish_date': None}\n"
+     ]
+    }
+   ],
+   "source": [
+    "loader = NewsURLLoader(urls=urls)\n",
+    "data = loader.load()\n",
+    "print(\"First article: \", data[0])\n",
+    "print(\"\\nSecond article: \", data[1])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "source": [
+    "Use `nlp=True` to run NLP analysis and generate keywords and a summary."
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "98ac26c488315bff"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "b68a26b3",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2023-08-02T21:18:19.585758200Z",
+     "start_time": "2023-08-02T21:18:19.227074500Z"
+    }
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "First article:  page_content='In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact. Neither she nor her representatives have commented.' metadata={'title': 'Donald Trump indictment: What do we know about the six co-conspirators?', 'link': 'https://www.bbc.com/news/world-us-canada-66388172', 'authors': [], 'language': 'en', 'description': 'Six people accused of helping Mr Trump undermine the election have been described by prosecutors.', 'publish_date': None, 'keywords': ['powell', 'know', 'donald', 'trump', 'review', 'indictment', 'telling', 'view', 'reasonable', 'person', 'testimony', 'coconspirators', 'riot', 'representatives', 'claims'], 'summary': 'In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact.\\nNeither she nor her representatives have commented.'}\n",
+      "\n",
+      "Second article:  page_content='Ms Williams added: \"If there\\'s anything that I can do in my power to ensure that dancers or singers or whoever decides to work with her don\\'t have to go through that same experience, I\\'m going to do that.\"' metadata={'title': \"Lizzo dancers Arianna Davis and Crystal Williams: 'No one speaks out, they are scared'\", 'link': 'https://www.bbc.com/news/entertainment-arts-66384971', 'authors': [], 'language': 'en', 'description': 'The US pop star is being sued for sexual harassment and fat-shaming but has yet to comment.', 'publish_date': None, 'keywords': ['davis', 'lizzo', 'singers', 'experience', 'crystal', 'ensure', 'arianna', 'theres', 'williams', 'power', 'going', 'dancers', 'im', 'speaks', 'work', 'ms', 'scared'], 'summary': 'Ms Williams added: \"If there\\'s anything that I can do in my power to ensure that dancers or singers or whoever decides to work with her don\\'t have to go through that same experience, I\\'m going to do that.\"'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "loader = NewsURLLoader(urls=urls, nlp=True)\n",
+    "data = loader.load()\n",
+    "print(\"First article: \", data[0])\n",
+    "print(\"\\nSecond article: \", data[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "['powell',\n 'know',\n 'donald',\n 'trump',\n 'review',\n 'indictment',\n 'telling',\n 'view',\n 'reasonable',\n 'person',\n 'testimony',\n 'coconspirators',\n 'riot',\n 'representatives',\n 'claims']"
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0].metadata['keywords']"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-08-02T21:18:19.585758200Z",
+     "start_time": "2023-08-02T21:18:19.585758200Z"
+    }
+   },
+   "id": "ae37e004e0284b1d"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "outputs": [
+    {
+     "data": {
+      "text/plain": "'In testimony to the congressional committee examining the 6 January riot, Mrs Powell said she did not review all of the many claims of election fraud she made, telling them that \"no reasonable person\" would view her claims as fact.\\nNeither she nor her representatives have commented.'"
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "data[0].metadata['summary']"
+   ],
+   "metadata": {
+    "collapsed": false,
+    "ExecuteTime": {
+     "end_time": "2023-08-02T21:18:19.598966800Z",
+     "start_time": "2023-08-02T21:18:19.594950200Z"
+    }
+   },
+   "id": "7676155fb175e53e"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/libs/langchain/langchain/document_loaders/__init__.py
+++ b/libs/langchain/langchain/document_loaders/__init__.py
@ -95,6 +95,7 @@ from langchain.document_loaders.mediawikidump import MWDumpLoader
 from langchain.document_loaders.merge import MergedDataLoader
 from langchain.document_loaders.mhtml import MHTMLLoader
 from langchain.document_loaders.modern_treasury import ModernTreasuryLoader
+from langchain.document_loaders.news import NewsURLLoader
 from langchain.document_loaders.notebook import NotebookLoader
 from langchain.document_loaders.notion import NotionDirectoryLoader
 from langchain.document_loaders.notiondb import NotionDBLoader
@ -250,6 +251,7 @@ __all__ = [
    "MergedDataLoader",
    "MHTMLLoader",
    "ModernTreasuryLoader",
+    "NewsURLLoader",
    "NotebookLoader",
    "NotionDBLoader",
    "NotionDirectoryLoader",
--- a/libs/langchain/langchain/document_loaders/news.py
+++ b/libs/langchain/langchain/document_loaders/news.py
@ -0,0 +1,124 @@
+"""Loader that uses newspaper to load news articles from URLs."""
+import logging
+from typing import Any, Iterator, List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+logger = logging.getLogger(__name__)
+
+
+class NewsURLLoader(BaseLoader):
+    """Loader that uses newspaper to load news articles from URLs.
+
+    Args:
+        urls: URLs to load. Each is loaded into its own document.
+        text_mode: If True, extract text from URL and use that for page content.
+            Otherwise, extract raw HTML.
+        nlp: If True, perform NLP on the extracted contents, like providing a summary
+            and extracting keywords.
+        continue_on_failure: If True, continue loading documents even if
+            loading fails for a particular URL.
+        show_progress_bar: If True, use tqdm to show a loading progress bar. Requires
+            tqdm to be installed, ``pip install tqdm``.
+        **newspaper_kwargs: Any additional named arguments to pass to
+            newspaper.Article().
+
+    Example:
+        .. code-block:: python
+
+            from langchain.document_loaders import NewsURLLoader
+
+            loader = NewsURLLoader(
+                urls=["<url-1>", "<url-2>"],
+            )
+            docs = loader.load()
+
+    Newspaper reference:
+        https://newspaper.readthedocs.io/en/latest/
+    """
+
+    def __init__(
+        self,
+        urls: List[str],
+        text_mode: bool = True,
+        nlp: bool = False,
+        continue_on_failure: bool = True,
+        show_progress_bar: bool = False,
+        **newspaper_kwargs: Any,
+    ) -> None:
+        """Initialize with file path."""
+        try:
+            import newspaper  # noqa:F401
+
+            self.__version = newspaper.__version__
+        except ImportError:
+            raise ImportError(
+                "newspaper package not found, please install it with "
+                "`pip install newspaper3k`"
+            )
+
+        self.urls = urls
+        self.text_mode = text_mode
+        self.nlp = nlp
+        self.continue_on_failure = continue_on_failure
+        self.newspaper_kwargs = newspaper_kwargs
+        self.show_progress_bar = show_progress_bar
+
+    def load(self) -> List[Document]:
+        iter = self.lazy_load()
+        if self.show_progress_bar:
+            try:
+                from tqdm import tqdm
+            except ImportError as e:
+                raise ImportError(
+                    "Package tqdm must be installed if show_progress_bar=True. "
+                    "Please install with 'pip install tqdm' or set "
+                    "show_progress_bar=False."
+                ) from e
+            iter = tqdm(iter)
+        return list(iter)
+
+    def lazy_load(self) -> Iterator[Document]:
+        try:
+            from newspaper import Article
+        except ImportError as e:
+            raise ImportError(
+                "Cannot import newspaper, please install with `pip install newspaper3k`"
+            ) from e
+
+        for url in self.urls:
+            try:
+                article = Article(url, **self.newspaper_kwargs)
+                article.download()
+                article.parse()
+
+                if self.nlp:
+                    article.nlp()
+
+            except Exception as e:
+                if self.continue_on_failure:
+                    logger.error(f"Error fetching or processing {url}, exception: {e}")
+                    continue
+                else:
+                    raise e
+
+            metadata = {
+                "title": getattr(article, "title", ""),
+                "link": getattr(article, "url", getattr(article, "canonical_link", "")),
+                "authors": getattr(article, "authors", []),
+                "language": getattr(article, "meta_lang", ""),
+                "description": getattr(article, "meta_description", ""),
+                "publish_date": getattr(article, "publish_date", ""),
+            }
+
+            if self.text_mode:
+                content = article.text
+            else:
+                content = article.html
+
+            if self.nlp:
+                metadata["keywords"] = getattr(article, "keywords", [])
+                metadata["summary"] = getattr(article, "summary", "")
+
+            yield Document(page_content=content, metadata=metadata)
--- a/libs/langchain/tests/integration_tests/document_loaders/test_news.py
+++ b/libs/langchain/tests/integration_tests/document_loaders/test_news.py
@ -0,0 +1,68 @@
+import random
+
+import pytest
+import requests
+from bs4 import BeautifulSoup
+
+from langchain.document_loaders import NewsURLLoader
+
+
+def get_random_news_url() -> str:
+    response = requests.get("https://news.google.com")
+    soup = BeautifulSoup(response.text, "html.parser")
+
+    article_links = [
+        a["href"] for a in soup.find_all("a", href=True) if "/articles/" in a["href"]
+    ]
+    random_article_link = random.choice(article_links)
+
+    return "https://news.google.com" + random_article_link
+
+
+def test_news_loader() -> None:
+    loader = NewsURLLoader([get_random_news_url()])
+    docs = loader.load()
+
+    assert docs[0] is not None
+    assert hasattr(docs[0], "page_content")
+    assert hasattr(docs[0], "metadata")
+
+    metadata = docs[0].metadata
+    assert "title" in metadata
+    assert "link" in metadata
+    assert "authors" in metadata
+    assert "language" in metadata
+    assert "description" in metadata
+    assert "publish_date" in metadata
+
+
+def test_news_loader_with_nlp() -> None:
+    loader = NewsURLLoader([get_random_news_url()], nlp=True)
+    docs = loader.load()
+
+    assert docs[0] is not None
+    assert hasattr(docs[0], "page_content")
+    assert hasattr(docs[0], "metadata")
+
+    metadata = docs[0].metadata
+    assert "title" in metadata
+    assert "link" in metadata
+    assert "authors" in metadata
+    assert "language" in metadata
+    assert "description" in metadata
+    assert "publish_date" in metadata
+    assert "keywords" in metadata
+    assert "summary" in metadata
+
+
+def test_continue_on_failure_true() -> None:
+    """Test exception is not raised when continue_on_failure=True."""
+    loader = NewsURLLoader(["badurl.foobar"])
+    loader.load()
+
+
+def test_continue_on_failure_false() -> None:
+    """Test exception is raised when continue_on_failure=False."""
+    loader = NewsURLLoader(["badurl.foobar"], continue_on_failure=False)
+    with pytest.raises(Exception):
+        loader.load()