mirror of
https://github.com/hwchase17/langchain
synced 2024-10-31 15:20:26 +00:00
8ef7e14a85
Description: Added a document loader for a list of RSS feeds or OPML. It iterates through the list and uses NewsURLLoader to load each article. - Issue: N/A - Dependencies: feedparser, listparser - Tag maintainer: @rlancemartin, @eyurtsev - Twitter handle: @ruze --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
171 lines
3.3 KiB
Plaintext
171 lines
3.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "2dfc4698",
|
|
"metadata": {},
|
|
"source": [
|
|
"# RSS Feeds\n",
|
|
"\n",
|
|
"This covers how to load HTML news articles from a list of RSS feed URLs into a document format that we can use downstream."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "16c3699e",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"from langchain.document_loaders import RSSFeedLoader"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "836fbac1",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"urls = [\"https://www.engadget.com/rss.xml\"]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"id": "33089aba-ff74-4d00-8f40-9449c29587cc",
|
|
"metadata": {},
|
|
"source": [
|
|
"Pass in the list of feed URLs to load the articles they reference into Documents."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "00f46fda",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"loader = RSSFeedLoader(urls=urls)\n",
|
|
"data = loader.load()\n",
|
|
"print(len(data))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"data[0]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "b447468cc42266d0"
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"source": [
|
|
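"You can pass extra arguments to `RSSFeedLoader`; they are forwarded to the `NewsURLLoader` it uses to load each article. The example below passes `nlp=True` and then reads the `keywords` and `summary` metadata fields."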
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "c36d3b0d329faf2a"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"loader = RSSFeedLoader(urls=urls, nlp=True)\n",
|
|
"data = loader.load()\n",
|
|
"print(len(data))"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "5fdada62470d3019"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"data[0].metadata['keywords']"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "11d71963f7735c1d"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"data[0].metadata['summary']"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "9fb64ba0e8780966"
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"source": [
|
|
"You can also use an OPML file such as a Feedly export. Pass in either a URL or the OPML contents."
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "98ac26c488315bff"
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "8b6f07ae526a897c",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"with open(\"example_data/sample_rss_feeds.opml\", \"r\") as f:\n",
|
|
" loader = RSSFeedLoader(opml=f.read())\n",
|
|
"data = loader.load()\n",
|
|
"print(len(data))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"outputs": [],
|
|
"source": [
|
|
"data[0]"
|
|
],
|
|
"metadata": {
|
|
"collapsed": false
|
|
},
|
|
"id": "b68a26b3"
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.10.6"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|