Add Twitter Tweet Loader (#3050)

Reformatted version of #3022

---------

Co-authored-by: LiaoKong <568250549@qq.com>
This commit is contained in:
Zander Chase 2023-04-17 21:44:54 -07:00 committed by GitHub
parent 2984ad3964
commit 93c0514105
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 225 additions and 0 deletions

View File

@ -0,0 +1,114 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "66a7777e",
"metadata": {},
"source": [
"# Twitter\n",
"\n",
"This loader fetches the text from the Tweets of a list of Twitter users, using the `tweepy` Python package.\n",
"You must initialize the loader with your Twitter API token, and you need to pass in the Twitter username you want to extract."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9ec8a3b3",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TwitterTweetLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "43128d8d",
"metadata": {},
"outputs": [],
"source": [
"#!pip install tweepy"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "35d6809a",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"loader = TwitterTweetLoader.from_bearer_token(\n",
" oauth2_bearer_token=\"YOUR BEARER TOKEN\",\n",
" twitter_users=['elonmusk'],\n",
" number_tweets=50, # Default value is 100\n",
")\n",
"\n",
"# Or load from access token and consumer keys\n",
"# loader = TwitterTweetLoader.from_secrets(\n",
"# access_token='YOUR ACCESS TOKEN',\n",
"# access_token_secret='YOUR ACCESS TOKEN SECRET',\n",
"# consumer_key='YOUR CONSUMER KEY',\n",
"# consumer_secret='YOUR CONSUMER SECRET',\n",
"# twitter_users=['elonmusk'],\n",
"# number_tweets=50,\n",
"# )"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='@MrAndyNgo @REI One store after another shutting down', metadata={'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ngô 🏳️\\u200d🌈', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': '<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
" Document(page_content='@KanekoaTheGreat @joshrogin @glennbeck Large ships are fundamentally vulnerable to ballistic (hypersonic) missiles', metadata={'created_at': 'Tue Apr 18 03:43:25 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ngô 🏳️\\u200d🌈', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': '<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
" Document(page_content='@KanekoaTheGreat The Golden Rule', metadata={'created_at': 'Tue Apr 18 03:37:17 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ngô 🏳️\\u200d🌈', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': '<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
" Document(page_content='@KanekoaTheGreat 🧐', metadata={'created_at': 'Tue Apr 18 03:35:48 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ngô 🏳️\\u200d🌈', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': '<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
" Document(page_content='@TRHLofficial Whats he talking about and why is it sponsored by Eriks son?', metadata={'created_at': 'Tue Apr 18 03:32:17 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ngô 🏳️\\u200d🌈', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': '<a href=\"http://twitter.com/download/iphone\" rel=\"nofollow\">Twitter for iPhone</a>', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}})]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"documents = loader.load()\n",
"documents[:5]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -62,6 +62,7 @@ from langchain.document_loaders.slack_directory import SlackDirectoryLoader
from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.srt import SRTLoader
from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.telegram import TelegramChatLoader
from langchain.document_loaders.text import TextLoader from langchain.document_loaders.text import TextLoader
from langchain.document_loaders.twitter import TwitterTweetLoader
from langchain.document_loaders.unstructured import ( from langchain.document_loaders.unstructured import (
UnstructuredFileIOLoader, UnstructuredFileIOLoader,
UnstructuredFileLoader, UnstructuredFileLoader,
@ -147,5 +148,6 @@ __all__ = [
"BiliBiliLoader", "BiliBiliLoader",
"SlackDirectoryLoader", "SlackDirectoryLoader",
"GitLoader", "GitLoader",
"TwitterTweetLoader",
"ImageCaptionLoader", "ImageCaptionLoader",
] ]

View File

@ -0,0 +1,109 @@
"""Twitter document loader."""
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
if TYPE_CHECKING:
import tweepy
from tweepy import OAuth2BearerHandler, OAuthHandler
def _dependable_tweepy_import() -> tweepy:
try:
import tweepy
except ImportError:
raise ValueError(
"tweepy package not found, please install it with `pip install tweepy`"
)
return tweepy
class TwitterTweetLoader(BaseLoader):
"""Twitter tweets loader.
Read tweets of user twitter handle.
First you need to go to
`https://developer.twitter.com/en/docs/twitter-api
/getting-started/getting-access-to-the-twitter-api`
to get your token. And create a v2 version of the app.
"""
def __init__(
self,
auth_handler: Union[OAuthHandler, OAuth2BearerHandler],
twitter_users: Sequence[str],
number_tweets: Optional[int] = 100,
):
self.auth = auth_handler
self.twitter_users = twitter_users
self.number_tweets = number_tweets
def load(self) -> List[Document]:
"""Load tweets."""
tweepy = _dependable_tweepy_import()
api = tweepy.API(self.auth, parser=tweepy.parsers.JSONParser())
results: List[Document] = []
for username in self.twitter_users:
tweets = api.user_timeline(screen_name=username, count=self.number_tweets)
user = api.get_user(screen_name=username)
docs = self._format_tweets(tweets, user)
results.extend(docs)
return results
def _format_tweets(
self, tweets: List[Dict[str, Any]], user_info: dict
) -> Iterable[Document]:
"""Format tweets into a string."""
for tweet in tweets:
metadata = {
"created_at": tweet["created_at"],
"user_info": user_info,
}
yield Document(
page_content=tweet["text"],
metadata=metadata,
)
@classmethod
def from_bearer_token(
cls,
oauth2_bearer_token: str,
twitter_users: Sequence[str],
number_tweets: Optional[int] = 100,
) -> TwitterTweetLoader:
"""Create a TwitterTweetLoader from OAuth2 bearer token."""
tweepy = _dependable_tweepy_import()
auth = tweepy.OAuth2BearerHandler(oauth2_bearer_token)
return cls(
auth_handler=auth,
twitter_users=twitter_users,
number_tweets=number_tweets,
)
@classmethod
def from_secrets(
cls,
access_token: str,
access_token_secret: str,
consumer_key: str,
consumer_secret: str,
twitter_users: Sequence[str],
number_tweets: Optional[int] = 100,
) -> TwitterTweetLoader:
"""Create a TwitterTweetLoader from access tokens and secrets."""
tweepy = _dependable_tweepy_import()
auth = tweepy.OAuthHandler(
access_token=access_token,
access_token_secret=access_token_secret,
consumer_key=consumer_key,
consumer_secret=consumer_secret,
)
return cls(
auth_handler=auth,
twitter_users=twitter_users,
number_tweets=number_tweets,
)