RSS Feed / OPML loader (#8694)

- Description: Added a document loader for a list of RSS feed URLs or an OPML file. It iterates through the feeds and uses NewsURLLoader to load each article; see the usage sketch below.
- Issue: N/A
- Dependencies: feedparser, listparser (newspaper3k via NewsURLLoader)
- Tag maintainer: @rlancemartin, @eyurtsev
- Twitter handle: @ruze
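
A minimal usage sketch (the feed URL is illustrative; the class, its arguments, and the `feed` metadata key are those added in this PR):

```python
from langchain.document_loaders import RSSFeedLoader

# Load every article from the listed feeds into Document objects.
loader = RSSFeedLoader(urls=["https://www.engadget.com/rss.xml"])
docs = loader.load()

print(len(docs))
print(docs[0].metadata["feed"])  # each Document records the feed it came from
```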

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
ruze committed 8ef7e14a85 (parent 53e4148a1b)

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>

@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2dfc4698",
"metadata": {},
"source": [
"# RSS Feeds\n",
"\n",
"This covers how to load HTML news articles from a list of RSS feed URLs into a document format that we can use downstream."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16c3699e",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import RSSFeedLoader"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "836fbac1",
"metadata": {},
"outputs": [],
"source": [
"urls = [\"https://www.engadget.com/rss.xml\"]"
]
},
{
"cell_type": "markdown",
"id": "33089aba-ff74-4d00-8f40-9449c29587cc",
"metadata": {},
"source": [
"Pass in urls to load them into Documents"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00f46fda",
"metadata": {},
"outputs": [],
"source": [
"loader = RSSFeedLoader(urls=urls)\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0]"
],
"metadata": {
"collapsed": false
},
"id": "b447468cc42266d0"
},
{
"cell_type": "markdown",
"source": [
"You can pass arguments to the NewsURLLoader which it uses to load articles."
],
"metadata": {
"collapsed": false
},
"id": "c36d3b0d329faf2a"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"loader = RSSFeedLoader(urls=urls, nlp=True)\n",
"data = loader.load()\n",
"print(len(data))"
],
"metadata": {
"collapsed": false
},
"id": "5fdada62470d3019"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0].metadata['keywords']"
],
"metadata": {
"collapsed": false
},
"id": "11d71963f7735c1d"
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0].metadata['summary']"
],
"metadata": {
"collapsed": false
},
"id": "9fb64ba0e8780966"
},
{
"cell_type": "markdown",
"source": [
"You can also use an OPML file such as a Feedly export. Pass in either a URL or the OPML contents."
],
"metadata": {
"collapsed": false
},
"id": "98ac26c488315bff"
},
{
"cell_type": "code",
"execution_count": null,
"id": "8b6f07ae526a897c",
"metadata": {},
"outputs": [],
"source": [
"with open(\"example_data/sample_rss_feeds.opml\", \"r\") as f:\n",
" loader = RSSFeedLoader(opml=f.read())\n",
"data = loader.load()\n",
"print(len(data))"
]
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"data[0]"
],
"metadata": {
"collapsed": false
},
"id": "b68a26b3"
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -128,6 +128,7 @@ from langchain.document_loaders.recursive_url_loader import RecursiveUrlLoader
from langchain.document_loaders.reddit import RedditPostsLoader
from langchain.document_loaders.roam import RoamLoader
from langchain.document_loaders.rocksetdb import RocksetLoader
from langchain.document_loaders.rss import RSSFeedLoader
from langchain.document_loaders.rst import UnstructuredRSTLoader
from langchain.document_loaders.rtf import UnstructuredRTFLoader
from langchain.document_loaders.s3_directory import S3DirectoryLoader
@@ -280,6 +281,7 @@ __all__ = [
"RedditPostsLoader",
"RoamLoader",
"RocksetLoader",
"RSSFeedLoader",
"S3DirectoryLoader",
"S3FileLoader",
"SRTLoader",

@@ -0,0 +1,133 @@
"""Loader that uses unstructured to load HTML files."""
import logging
from typing import Any, Iterator, List, Optional, Sequence
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from langchain.document_loaders.news import NewsURLLoader
logger = logging.getLogger(__name__)
class RSSFeedLoader(BaseLoader):
"""Loader that uses newspaper to load news articles from RSS feeds.
Args:
urls: URLs for RSS feeds to load. Each articles in the feed is loaded into its own document.
opml: OPML file to load feed urls from. Only one of urls or opml should be provided. The value
can be a URL string, or OPML markup contents as byte or string.
continue_on_failure: If True, continue loading documents even if
loading fails for a particular URL.
show_progress_bar: If True, use tqdm to show a loading progress bar. Requires
tqdm to be installed, ``pip install tqdm``.
**newsloader_kwargs: Any additional named arguments to pass to
NewsURLLoader.
Example:
.. code-block:: python
from langchain.document_loaders import RSSFeedLoader
loader = RSSFeedLoader(
urls=["<url-1>", "<url-2>"],
)
docs = loader.load()
The loader uses feedparser to parse RSS feeds. The feedparser library is not installed by default so you should
install it if using this loader:
https://pythonhosted.org/feedparser/
If you use OPML, you should also install listparser:
https://pythonhosted.org/listparser/
Finally, newspaper is used to process each article:
https://newspaper.readthedocs.io/en/latest/
""" # noqa: E501
def __init__(
self,
urls: Optional[Sequence[str]] = None,
opml: Optional[str] = None,
continue_on_failure: bool = True,
show_progress_bar: bool = False,
**newsloader_kwargs: Any,
) -> None:
"""Initialize with urls or OPML."""
if (urls is None) == (
opml is None
): # This is True if both are None or neither is None
raise ValueError(
"Provide either the urls or the opml argument, but not both."
)
self.urls = urls
self.opml = opml
self.continue_on_failure = continue_on_failure
self.show_progress_bar = show_progress_bar
self.newsloader_kwargs = newsloader_kwargs
def load(self) -> List[Document]:
iter = self.lazy_load()
if self.show_progress_bar:
try:
from tqdm import tqdm
except ImportError as e:
raise ImportError(
"Package tqdm must be installed if show_progress_bar=True. "
"Please install with 'pip install tqdm' or set "
"show_progress_bar=False."
) from e
iter = tqdm(iter)
return list(iter)
@property
def _get_urls(self) -> Sequence[str]:
if self.urls:
return self.urls
try:
import listparser
except ImportError as e:
raise ImportError(
"Package listparser must be installed if the opml arg is used. "
"Please install with 'pip install listparser' or use the "
"urls arg instead."
) from e
rss = listparser.parse(self.opml)
return [feed.url for feed in rss.feeds]
def lazy_load(self) -> Iterator[Document]:
try:
import feedparser # noqa:F401
except ImportError:
raise ImportError(
"feedparser package not found, please install it with "
"`pip install feedparser`"
)
for url in self._get_urls:
try:
feed = feedparser.parse(url)
if getattr(feed, "bozo", False):
raise ValueError(
f"Error fetching {url}, exception: {feed.bozo_exception}"
)
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error fetching {url}, exception: {e}")
continue
else:
raise e
try:
for entry in feed.entries:
loader = NewsURLLoader(
urls=[entry.link],
**self.newsloader_kwargs,
)
article = loader.load()[0]
article.metadata["feed"] = url
yield article
except Exception as e:
if self.continue_on_failure:
logger.error(f"Error processing entry {entry.link}, exception: {e}")
continue
else:
raise e
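
For reference, a rough standalone sketch of the flow the loader implements for the OPML path, assuming feedparser and listparser are installed (the OPML file name is illustrative):

```python
import feedparser
import listparser

# Extract feed URLs from an OPML export, then parse each feed, as _get_urls
# and lazy_load do above.
with open("sample_rss_feeds.opml") as f:
    rss = listparser.parse(f.read())

for feed_meta in rss.feeds:
    parsed = feedparser.parse(feed_meta.url)
    for entry in parsed.entries:
        print(entry.link)  # each link is handed to NewsURLLoader by the loader
```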

@@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry and should not be changed by hand.
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
[[package]]
name = "absl-py"
@@ -2158,6 +2158,18 @@ ssh = ["bcrypt (>=3.1.5)"]
test = ["pretend", "pytest (>=6.2.0)", "pytest-benchmark", "pytest-cov", "pytest-xdist"]
test-randomorder = ["pytest-randomly"]
[[package]]
name = "cssselect"
version = "1.2.0"
description = "cssselect parses CSS3 Selectors and translates them to XPath 1.0"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "cssselect-1.2.0-py2.py3-none-any.whl", hash = "sha256:da1885f0c10b60c03ed5eccbb6b68d6eff248d91976fcde348f395d54c9fd35e"},
{file = "cssselect-1.2.0.tar.gz", hash = "sha256:666b19839cfaddb9ce9d36bfe4c969132c647b92fc9088c4e23f786b30f1b3dc"},
]
[[package]]
name = "cycler"
version = "0.11.0"
@@ -2838,6 +2850,22 @@ files = [
[package.extras]
devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"]
[[package]]
name = "feedfinder2"
version = "0.0.4"
description = "Find the feed URLs for a website."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "feedfinder2-0.0.4.tar.gz", hash = "sha256:3701ee01a6c85f8b865a049c30ba0b4608858c803fe8e30d1d289fdbe89d0efe"},
]
[package.dependencies]
beautifulsoup4 = "*"
requests = "*"
six = "*"
[[package]]
name = "feedparser"
version = "6.0.10"
@@ -4378,6 +4406,17 @@ docs = ["Jinja2 (==2.11.3)", "MarkupSafe (==1.1.1)", "Pygments (==2.8.1)", "alab
qa = ["flake8 (==3.8.3)", "mypy (==0.782)"]
testing = ["Django (<3.1)", "attrs", "colorama", "docopt", "pytest (<7.0.0)"]
[[package]]
name = "jieba3k"
version = "0.35.1"
description = "Chinese Words Segementation Utilities"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "jieba3k-0.35.1.zip", hash = "sha256:980a4f2636b778d312518066be90c7697d410dd5a472385f5afced71a2db1c10"},
]
[[package]]
name = "jina"
version = "3.14.1"
@@ -6472,6 +6511,33 @@ doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.2)", "pydata-sphinx-
extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.9)", "sympy (>=1.10)"]
test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"]
[[package]]
name = "newspaper3k"
version = "0.2.8"
description = "Simplified python article discovery & extraction."
category = "main"
optional = true
python-versions = "*"
files = [
{file = "newspaper3k-0.2.8-py3-none-any.whl", hash = "sha256:44a864222633d3081113d1030615991c3dbba87239f6bbf59d91240f71a22e3e"},
{file = "newspaper3k-0.2.8.tar.gz", hash = "sha256:9f1bd3e1fb48f400c715abf875cc7b0a67b7ddcd87f50c9aeeb8fcbbbd9004fb"},
]
[package.dependencies]
beautifulsoup4 = ">=4.4.1"
cssselect = ">=0.9.2"
feedfinder2 = ">=0.0.4"
feedparser = ">=5.2.1"
jieba3k = ">=0.35.1"
lxml = ">=3.6.0"
nltk = ">=3.2.1"
Pillow = ">=3.3.0"
python-dateutil = ">=2.5.3"
PyYAML = ">=3.11"
requests = ">=2.10.0"
tinysegmenter = "0.3"
tldextract = ">=2.0.1"
[[package]]
name = "nlpcloud"
version = "1.0.42"
@@ -10001,6 +10067,22 @@ urllib3 = ">=1.21.1,<1.27"
socks = ["PySocks (>=1.5.6,!=1.5.7)"]
use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
[[package]]
name = "requests-file"
version = "1.5.1"
description = "File transport adapter for Requests"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "requests-file-1.5.1.tar.gz", hash = "sha256:07d74208d3389d01c38ab89ef403af0cfec63957d53a0081d8eca738d0247d8e"},
{file = "requests_file-1.5.1-py2.py3-none-any.whl", hash = "sha256:dfe5dae75c12481f68ba353183c53a65e6044c923e64c24b2209f6c7570ca953"},
]
[package.dependencies]
requests = ">=1.0.0"
six = "*"
[[package]]
name = "requests-oauthlib"
version = "1.3.1"
@@ -11708,6 +11790,35 @@ webencodings = ">=0.4"
doc = ["sphinx", "sphinx_rtd_theme"]
test = ["flake8", "isort", "pytest"]
[[package]]
name = "tinysegmenter"
version = "0.3"
description = "Very compact Japanese tokenizer"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "tinysegmenter-0.3.tar.gz", hash = "sha256:ed1f6d2e806a4758a73be589754384cbadadc7e1a414c81a166fc9adf2d40c6d"},
]
[[package]]
name = "tldextract"
version = "3.4.4"
description = "Accurately separates a URL's subdomain, domain, and public suffix, using the Public Suffix List (PSL). By default, this includes the public ICANN TLDs and their exceptions. You can optionally support the Public Suffix List's private domains as well."
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "tldextract-3.4.4-py3-none-any.whl", hash = "sha256:581e7dbefc90e7bb857bb6f768d25c811a3c5f0892ed56a9a2999ddb7b1b70c2"},
{file = "tldextract-3.4.4.tar.gz", hash = "sha256:5fe3210c577463545191d45ad522d3d5e78d55218ce97215e82004dcae1e1234"},
]
[package.dependencies]
filelock = ">=3.0.8"
idna = "*"
requests = ">=2.1.0"
requests-file = ">=1.4"
[[package]]
name = "tokenizers"
version = "0.13.3"
@@ -11950,7 +12061,7 @@ files = [
]
[package.dependencies]
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\""}
accelerate = {version = ">=0.20.2", optional = true, markers = "extra == \"accelerate\" or extra == \"torch\""}
filelock = "*"
huggingface-hub = ">=0.14.1,<1.0"
numpy = ">=1.17"
@@ -13422,15 +13533,15 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["anthropic", "clarifai", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "marqo", "pymongo", "weaviate-client", "redis", "google-api-python-client", "google-auth", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "libdeeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "langkit", "lark", "pexpect", "pyvespa", "O365", "jq", "docarray", "steamship", "pdfminer-six", "lxml", "requests-toolbelt", "neo4j", "openlm", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "momento", "singlestoredb", "tigrisdb", "nebula3-python", "awadb", "esprima", "octoai-sdk", "rdflib", "amadeus", "xinference", "librosa", "python-arango"]
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-search-documents"]
all = ["O365", "aleph-alpha-client", "amadeus", "anthropic", "arxiv", "atlassian-python-api", "awadb", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clarifai", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "esprima", "faiss-cpu", "google-api-python-client", "google-auth", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "libdeeplake", "librosa", "lxml", "manifest-ml", "marqo", "momento", "nebula3-python", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "octoai-sdk", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "python-arango", "pyvespa", "qdrant-client", "rdflib", "redis", "requests-toolbelt", "sentence-transformers", "singlestoredb", "spacy", "steamship", "tensorflow-text", "tigrisdb", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha", "xinference"]
azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "azure-search-documents", "openai"]
clarifai = ["clarifai"]
cohere = ["cohere"]
docarray = ["docarray"]
embeddings = ["sentence-transformers"]
extended-testing = ["beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "jq", "pdfminer-six", "pgvector", "pypdf", "pymupdf", "pypdfium2", "tqdm", "lxml", "atlassian-python-api", "mwparserfromhell", "mwxml", "pandas", "telethon", "psychicapi", "zep-python", "gql", "requests-toolbelt", "html2text", "py-trello", "scikit-learn", "streamlit", "pyspark", "openai", "sympy", "rapidfuzz", "openai", "rank-bm25", "geopandas", "jinja2", "xinference", "gitpython"]
extended-testing = ["atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "esprima", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "streamlit", "sympy", "telethon", "tqdm", "xinference", "zep-python"]
javascript = ["esprima"]
llms = ["anthropic", "clarifai", "cohere", "openai", "openllm", "openlm", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers", "xinference"]
llms = ["anthropic", "clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openllm", "openlm", "torch", "transformers", "xinference"]
openai = ["openai", "tiktoken"]
qdrant = ["qdrant-client"]
text-helpers = ["chardet"]
@@ -13438,4 +13549,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "84ededcf21a742653863c033dd31e1b24af7562d479c179cd58ba22b2a9805e9"
content-hash = "0708c3b45f59eea36919ff9ff99fa6eddc81bccb654cce183641ef8396ea5290"

@@ -128,6 +128,8 @@ xinference = {version = "^0.0.6", optional = true}
python-arango = {version = "^7.5.9", optional = true}
gitpython = {version = "^3.1.32", optional = true}
librosa = {version="^0.10.0.post2", optional = true }
feedparser = {version = "^6.0.10", optional = true}
newspaper3k = {version = "^0.2.8", optional = true}
[tool.poetry.group.test.dependencies]
# The only dependencies that should be added are
@@ -363,6 +365,8 @@ extended_testing = [
"jinja2",
"xinference",
"gitpython",
"newspaper3k",
"feedparser",
]
[tool.ruff]

@@ -0,0 +1,42 @@
from pathlib import Path

from langchain.document_loaders.rss import RSSFeedLoader


def test_rss_loader() -> None:
    loader = RSSFeedLoader(urls=["https://www.engadget.com/rss.xml"])
    docs = loader.load()

    assert docs[0] is not None
    assert hasattr(docs[0], "page_content")
    assert hasattr(docs[0], "metadata")
    metadata = docs[0].metadata
    assert "feed" in metadata
    assert "title" in metadata
    assert "link" in metadata
    assert "authors" in metadata
    assert "language" in metadata
    assert "description" in metadata
    assert "publish_date" in metadata


def test_rss_loader_with_opml() -> None:
    file_path = Path(__file__).parent.parent / "examples"
    with open(file_path.joinpath("sample_rss_feeds.opml"), "r") as f:
        loader = RSSFeedLoader(opml=f.read())
    docs = loader.load()

    assert docs[0] is not None
    assert hasattr(docs[0], "page_content")
    assert hasattr(docs[0], "metadata")
    metadata = docs[0].metadata
    assert "feed" in metadata
    assert "title" in metadata
    assert "link" in metadata
    assert "authors" in metadata
    assert "language" in metadata
    assert "description" in metadata
    assert "publish_date" in metadata

@@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<opml version="1.0">
  <head>
    <title>Sample RSS feed subscriptions</title>
  </head>
  <body>
    <outline text="Tech" title="Tech">
      <outline type="rss" text="Engadget" title="Engadget" xmlUrl="http://www.engadget.com/rss-full.xml" htmlUrl="http://www.engadget.com"/>
      <outline type="rss" text="Ars Technica - All content" title="Ars Technica - All content" xmlUrl="http://feeds.arstechnica.com/arstechnica/index/" htmlUrl="https://arstechnica.com"/>
    </outline>
  </body>
</opml>

@@ -0,0 +1,18 @@
import pytest

from langchain.document_loaders import RSSFeedLoader


@pytest.mark.requires("feedparser", "newspaper")
def test_continue_on_failure_true() -> None:
    """Test exception is not raised when continue_on_failure=True."""
    loader = RSSFeedLoader(["badurl.foobar"])
    loader.load()


@pytest.mark.requires("feedparser", "newspaper")
def test_continue_on_failure_false() -> None:
    """Test exception is raised when continue_on_failure=False."""
    loader = RSSFeedLoader(["badurl.foobar"], continue_on_failure=False)
    with pytest.raises(Exception):
        loader.load()
