RSS Feed / OPML loader (#8694)

- Description: Added a document loader for a list of RSS feed URLs or an OPML file. It iterates through the feeds and uses NewsURLLoader to load each article; see the usage sketch below.
- Issue: N/A
- Dependencies: feedparser, listparser (newspaper3k via NewsURLLoader)
- Tag maintainer: @rlancemartin, @eyurtsev
- Twitter handle: @ruze
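
A minimal usage sketch (the feed URL is illustrative; the class, its arguments, and the `feed` metadata key are those added in this PR):

```python
from langchain.document_loaders import RSSFeedLoader

# Load every article from the listed feeds into Document objects.
loader = RSSFeedLoader(urls=["https://www.engadget.com/rss.xml"])
docs = loader.load()

print(len(docs))
print(docs[0].metadata["feed"])  # each Document records the feed it came from
```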

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
ruze committed 8ef7e14a85 (parent 53e4148a1b)

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>

@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2dfc4698",
"metadata": {},
"source": [
"# RSS Feeds\n",
"\n",
"This covers how to load HTML news articles from a list of RSS feed URLs into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16c3699e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import RSSFeedLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "836fbac1",
"metadata": {},
"outputs": [],
"source": [
"urls = [\"https://www.engadget.com/rss.xml\"]"
]
},
{
"cell_type": "markdown",
"id": "33089aba-ff74-4d00-8f40-9449c29587cc",
"metadata": {},
"source": [
"Pass in urls to load them into Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00f46fda",
"metadata": {},
"outputs": [],
"source": [
"loader = RSSFeedLoader(urls=urls)\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0]"
],
"metadata": {
"collapsed": false
},
"id": "b447468cc42266d0"
},
{
"cell_type": "markdown",
"source": [
"You can pass arguments to the NewsURLLoader which it uses to load articles."
],
"metadata": {
"collapsed": false
},
"id": "c36d3b0d329faf2a"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"loader = RSSFeedLoader(urls=urls, nlp=True)\n",
"data = loader.load()\n",
"print(len(data))"
],
"metadata": {
"collapsed": false
},
"id": "5fdada62470d3019"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0].metadata['keywords']"
],
"metadata": {
"collapsed": false
},
"id": "11d71963f7735c1d"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0].metadata['summary']"
],
"metadata": {
"collapsed": false
},
"id": "9fb64ba0e8780966"
},
{
"cell_type": "markdown",
"source": [
"You can also use an OPML file such as a Feedly export. Pass in either a URL or the OPML contents."
],
"metadata": {
"collapsed": false
},
"id": "98ac26c488315bff"
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b6f07ae526a897c",
"metadata": {},
"outputs": [],
"source": [
"with open(\"example_data/sample_rss_feeds.opml\", \"r\") as f:\n",
" loader = RSSFeedLoader(opml=f.read())\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0]"
],
"metadata": {
"collapsed": false
},
"id": "b68a26b3"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -128,6 +128,7 @@ from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.document_loaders.reddit import RedditPostsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rocksetdb import RocksetLoader
from langchain.document_loaders.rss import RSSFeedLoader
from langchain.document_loaders.rst import UnstructuredRSTLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
@@ -280,6 +281,7 @@ __all__ = [
"RedditPostsLoader",
"RoamLoader",
"RocksetLoader",
"RSSFeedLoader",
"S3DirectoryLoader",
"S3FileLoader",
"SRTLoader",

@@ -0,0 +1,133 @@
"""Loader that uses unstructured to load HTML files."""
import logging
from typing import Any, Iterator, List, Optional, Sequence
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.news import NewsURLLoader
logger = logging.getLogger(__name__)
class RSSFeedLoader(BaseLoader):
"""Loader that uses newspaper to load news articles from RSS feeds.
Args:
urls: URLs for RSS feeds to load. Each articles in the feed is loaded into its own document.
opml: OPML file to load feed urls from. Only one of urls or opml should be provided. The value
can be a URL string, or OPML markup contents as byte or string.
continue_on_failure: If True, continue loading documents even if
loading fails for a particular URL.
show_progress_bar: If True, use tqdm to show a loading progress bar. Requires
tqdm to be installed, ``pip install tqdm``.
**newsloader_kwargs: Any additional named arguments to pass to
NewsURLLoader.
Example:
.. code-block:: python
from langchain.document_loaders import RSSFeedLoader
loader = RSSFeedLoader(
urls=["<url-1>", "<url-2>"],
)
docs = loader.load()
The loader uses feedparser to parse RSS feeds. The feedparser library is not installed by default so you should
install it if using this loader:
https://pythonhosted.org/feedparser/
If you use OPML, you should also install listparser:
https://pythonhosted.org/listparser/
Finally, newspaper is used to process each article:
https://newspaper.readthedocs.io/en/latest/
""" # noqa: E501
def __init__(
self,
urls: Optional[Sequence[str]] = None,
opml: Optional[str] = None,
continue_on_failure: bool = True,
show_progress_bar: bool = False,
**newsloader_kwargs: Any,
) -> None:
"""Initialize with urls or OPML."""
if (urls is None) == (
opml is None
): # This is True if both are None or neither is None
raise ValueError(
"Provide either the urls or the opml argument, but not both."
)
self.urls = urls
self.opml = opml
self.continue_on_failure = continue_on_failure
self.show_progress_bar = show_progress_bar
self.newsloader_kwargs = newsloader_kwargs
def load(self) -> List[Document]:
iter = self.lazy_load()
if self.show_progress_bar:
try:
from tqdm import tqdm
except ImportError as e:
raise ImportError(
"Package tqdm must be installed if show_progress_bar=True. "
"Please install with 'pip install tqdm' or set "
"show_progress_bar=False."
) from e
iter = tqdm(iter)
return list(iter)
@property
def _get_urls(self) -> Sequence[str]:
if self.urls:
return self.urls
try:
import listparser
except ImportError as e:
raise ImportError(
"Package listparser must be installed if the opml arg is used. "
"Please install with 'pip install listparser' or use the "
"urls arg instead."
) from e
rss = listparser.parse(self.opml)
return [feed.url for feed in rss.feeds]
def lazy_load(self) -> Iterator[Document]:
try:
import feedparser # noqa:F401
except ImportError:
raise ImportError(
"feedparser package not found, please install it with "
"`pip install feedparser`"
)
for url in self._get_urls:
try:
feed = feedparser.parse(url)
if getattr(feed, "bozo", False):
raise ValueError(
f"Error fetching {url}, exception: {feed.bozo_exception}"
)
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error fetching {url}, exception: {e}")
continue
else:
raise e
try:
for entry in feed.entries:
loader = NewsURLLoader(
urls=[entry.link],
**self.newsloader_kwargs,
)
article = loader.load()[0]
article.metadata["feed"] = url
yield article
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error processing entry {entry.link}, exception: {e}")
continue
else:
raise e
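
For reference, a rough standalone sketch of the flow the loader implements for the OPML path, assuming feedparser and listparser are installed (the OPML file name is illustrative):

```python
import feedparser
import listparser

# Extract feed URLs from an OPML export, then parse each feed, as _get_urls
# and lazy_load do above.
with open("sample_rss_feeds.opml") as f:
    rss = listparser.parse(f.read())

for feed_meta in rss.feeds:
    parsed = feedparser.parse(feed_meta.url)
    for entry in parsed.entries:
        print(entry.link)  # each link is handed to NewsURLLoader by the loader
```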

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "absl-py"
@@ -2158,6 +2158,18 @@ ssh = ["bcrypt (>=3.1.5)"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test-randomorder = ["pytest-randomly"]
[[package]]
name = "cssselect"
version = "1.2.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"},
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
]
[[package]]
name = "cycler"
version = "0.11.0"
@@ -2838,6 +2850,22 @@ files = [
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
[[package]]
name = "feedfinder2"
version = "0.0.4"
description = "Find the feed URLs for a website."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "feedfinder2-0.0.4.tar.gz", hash = "sha256:3701ee01a6c85f8b865a049c30ba0b4608858c803fe8e30d1d289fdbe89d0efe"},
]
[package.dependencies]
beautifulsoup4 = "*"
requests = "*"
six = "*"
[[package]]
name = "feedparser"
version = "6.0.10"
@@ -4378,6 +4406,17 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"]
[[package]]
name = "jieba3k"
version = "0.35.1"
description = "Chinese Words Segementation Utilities"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "jieba3k-0.35.1.zip", hash = "sha256:980a4f2636b778d312518066be90c7697d410dd5a472385f5afced71a2db1c10"},
]
[[package]]
name = "jina"
version = "3.14.1"
@@ -6472,6 +6511,33 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.2)", "pydata-sphinx-
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"]
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
[[package]]
name = "newspaper3k"
version = "0.2.8"
description = "Simplified python article discovery & extraction."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "newspaper3k-0.2.8-py3-none-any.whl", hash = "sha256:44a864222633d3081113d1030615991c3dbba87239f6bbf59d91240f71a22e3e"},
{file = "newspaper3k-0.2.8.tar.gz", hash = "sha256:9f1bd3e1fb48f400c715abf875cc7b0a67b7ddcd87f50c9aeeb8fcbbbd9004fb"},
]
[package.dependencies]
beautifulsoup4 = ">=4.4.1"
cssselect = ">=0.9.2"
feedfinder2 = ">=0.0.4"
feedparser = ">=5.2.1"
jieba3k = ">=0.35.1"
lxml = ">=3.6.0"
nltk = ">=3.2.1"
Pillow = ">=3.3.0"
python-dateutil = ">=2.5.3"
PyYAML = ">=3.11"
requests = ">=2.10.0"
tinysegmenter = "0.3"
tldextract = ">=2.0.1"
[[package]]
name = "nlpcloud"
version = "1.0.42"
@@ -10001,6 +10067,22 @@ urllib3 = ">=1.21.1,<1.27"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "requests-file"
version = "1.5.1"
description = "File transport adapter for Requests"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "requests-file-1.5.1.tar.gz", hash = "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e"},
{file = "requests_file-1.5.1-py2.py3-none-any.whl", hash = "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"},
]
[package.dependencies]
requests = ">=1.0.0"
six = "*"
[[package]]
name = "requests-oauthlib"
version = "1.3.1"
@@ -11708,6 +11790,35 @@ webencodings = ">=0.4"
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["flake8", "isort", "pytest"]
[[package]]
name = "tinysegmenter"
version = "0.3"
description = "Very compact Japanese tokenizer"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "tinysegmenter-0.3.tar.gz", hash = "sha256:ed1f6d2e806a4758a73be589754384cbadadc7e1a414c81a166fc9adf2d40c6d"},
]
[[package]]
name = "tldextract"
version = "3.4.4"
description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "tldextract-3.4.4-py3-none-any.whl", hash = "sha256:581e7dbefc90e7bb857bb6f768d25c811a3c5f0892ed56a9a2999ddb7b1b70c2"},
{file = "tldextract-3.4.4.tar.gz", hash = "sha256:5fe3210c577463545191d45ad522d3d5e78d55218ce97215e82004dcae1e1234"},
]
[package.dependencies]
filelock = ">=3.0.8"
idna = "*"
requests = ">=2.1.0"
requests-file = ">=1.4"
[[package]]
name = "tokenizers"
version = "0.13.3"
@@ -11950,7 +12061,7 @@ files = [
]
[package.dependencies]
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\""}
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\" or extra == \"torch\""}
filelock = "*"
huggingface-hub = ">=0.14.1,<1.0"
numpy = ">=1.17"
@@ -13422,15 +13533,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["anthropic", "clarifai", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "marqo", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "libdeeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb", "esprima", "octoai-sdk", "rdflib", "amadeus", "xinference", "librosa", "python-arango"]
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-search-documents"]
all = ["O365", "aleph-alpha-client", "amadeus", "anthropic", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "libdeeplake", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "octoai-sdk", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha", "xinference"]
azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "jq", "pdfminer-six", "pgvector", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "mwparserfromhell", "mwxml", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "streamlit", "pyspark", "openai", "sympy", "rapidfuzz", "openai", "rank-bm25", "geopandas", "jinja2", "xinference", "gitpython"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "openai", "openllm", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers", "xinference"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
openai = ["openai", "tiktoken"]
qdrant = ["qdrant-client"]
text-helpers = ["chardet"]
@@ -13438,4 +13549,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "84ededcf21a742653863c033dd31e1b24af7562d479c179cd58ba22b2a9805e9"
content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290"

@@ -128,6 +128,8 @@ xinference = {version = "^0.0.6", optional = true}
python-arango = {version = "^7.5.9", optional = true}
gitpython = {version = "^3.1.32", optional = true}
librosa = {version="^0.10.0.post2", optional = true }
feedparser = {version = "^6.0.10", optional = true}
newspaper3k = {version = "^0.2.8", optional = true}
[tool.poetry.group.test.dependencies]
# The only dependencies that should be added are
@@ -363,6 +365,8 @@ extended_testing = [
"jinja2",
"xinference",
"gitpython",
"newspaper3k",
"feedparser",
]
[tool.ruff]

@@ -0,0 +1,42 @@
from pathlib import Path

from langchain.document_loaders.rss import RSSFeedLoader


def test_rss_loader() -> None:
    loader = RSSFeedLoader(urls=["https://www.engadget.com/rss.xml"])
    docs = loader.load()

    assert docs[0] is not None
    assert hasattr(docs[0], "page_content")
    assert hasattr(docs[0], "metadata")
    metadata = docs[0].metadata
    assert "feed" in metadata
    assert "title" in metadata
    assert "link" in metadata
    assert "authors" in metadata
    assert "language" in metadata
    assert "description" in metadata
    assert "publish_date" in metadata


def test_rss_loader_with_opml() -> None:
    file_path = Path(__file__).parent.parent / "examples"
    with open(file_path.joinpath("sample_rss_feeds.opml"), "r") as f:
        loader = RSSFeedLoader(opml=f.read())
    docs = loader.load()

    assert docs[0] is not None
    assert hasattr(docs[0], "page_content")
    assert hasattr(docs[0], "metadata")
    metadata = docs[0].metadata
    assert "feed" in metadata
    assert "title" in metadata
    assert "link" in metadata
    assert "authors" in metadata
    assert "language" in metadata
    assert "description" in metadata
    assert "publish_date" in metadata

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>

@@ -0,0 +1,18 @@
import pytest

from langchain.document_loaders import RSSFeedLoader


@pytest.mark.requires("feedparser", "newspaper")
def test_continue_on_failure_true() -> None:
    """Test exception is not raised when continue_on_failure=True."""
    loader = RSSFeedLoader(["badurl.foobar"])
    loader.load()


@pytest.mark.requires("feedparser", "newspaper")
def test_continue_on_failure_false() -> None:
    """Test exception is raised when continue_on_failure=False."""
    loader = RSSFeedLoader(["badurl.foobar"], continue_on_failure=False)
    with pytest.raises(Exception):
        loader.load()
