From 93c0514105ee499e5d6113819c1b507deeb5f0b2 Mon Sep 17 00:00:00 2001
From: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
Date: Mon, 17 Apr 2023 21:44:54 -0700
Subject: [PATCH] Add Twitter Tweet Loader (#3050)
Reformatted version of #3022
---------
Co-authored-by: LiaoKong <568250549@qq.com>
---
.../document_loaders/examples/twitter.ipynb | 114 ++++++++++++++++++
langchain/document_loaders/__init__.py | 2 +
langchain/document_loaders/twitter.py | 109 +++++++++++++++++
3 files changed, 225 insertions(+)
create mode 100644 docs/modules/indexes/document_loaders/examples/twitter.ipynb
create mode 100644 langchain/document_loaders/twitter.py
diff --git a/docs/modules/indexes/document_loaders/examples/twitter.ipynb b/docs/modules/indexes/document_loaders/examples/twitter.ipynb
new file mode 100644
index 00000000..3713ad4f
--- /dev/null
+++ b/docs/modules/indexes/document_loaders/examples/twitter.ipynb
@@ -0,0 +1,114 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "66a7777e",
+ "metadata": {},
+ "source": [
+ "# Twitter\n",
+ "\n",
+ "This loader fetches the text of the tweets from a list of Twitter users, using the `tweepy` Python package.\n",
+ "You must initialize the loader with your Twitter API credentials (a bearer token, or an access token plus consumer keys) and pass in the list of Twitter usernames whose tweets you want to load."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "9ec8a3b3",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "from langchain.document_loaders import TwitterTweetLoader"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "43128d8d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#!pip install tweepy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "35d6809a",
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "loader = TwitterTweetLoader.from_bearer_token(\n",
+ " oauth2_bearer_token=\"YOUR BEARER TOKEN\",\n",
+ " twitter_users=['elonmusk'],\n",
+ " number_tweets=50, # Default value is 100\n",
+ ")\n",
+ "\n",
+ "# Or load from access token and consumer keys\n",
+ "# loader = TwitterTweetLoader.from_secrets(\n",
+ "# access_token='YOUR ACCESS TOKEN',\n",
+ "# access_token_secret='YOUR ACCESS TOKEN SECRET',\n",
+ "# consumer_key='YOUR CONSUMER KEY',\n",
+ "# consumer_secret='YOUR CONSUMER SECRET',\n",
+ "# twitter_users=['elonmusk'],\n",
+ "# number_tweets=50,\n",
+ "# )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false,
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[Document(page_content='@MrAndyNgo @REI One store after another shutting down', metadata={'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 
'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
+ " Document(page_content='@KanekoaTheGreat @joshrogin @glennbeck Large ships are fundamentally vulnerable to ballistic (hypersonic) missiles', metadata={'created_at': 'Tue Apr 18 03:43:25 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 
'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
+ " Document(page_content='@KanekoaTheGreat The Golden Rule', metadata={'created_at': 'Tue Apr 18 03:37:17 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 
'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
+ " Document(page_content='@KanekoaTheGreat 馃', metadata={'created_at': 'Tue Apr 18 03:35:48 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 
'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n",
+ " Document(page_content='@TRHLofficial What’s he talking about and why is it sponsored by Erik’s son?', metadata={'created_at': 'Tue Apr 18 03:32:17 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ngô 🏳️\u200d🌈', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 
'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}})]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "documents = loader.load()\n",
+ "documents[:5]"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index c47fb6cf..d12318f5 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -62,6 +62,7 @@ from langchain.document_loaders.slack_directory import SlackDirectoryLoader
from langchain.document_loaders.srt import SRTLoader
from langchain.document_loaders.telegram import TelegramChatLoader
from langchain.document_loaders.text import TextLoader
+from langchain.document_loaders.twitter import TwitterTweetLoader
from langchain.document_loaders.unstructured import (
UnstructuredFileIOLoader,
UnstructuredFileLoader,
@@ -147,5 +148,6 @@ __all__ = [
"BiliBiliLoader",
"SlackDirectoryLoader",
"GitLoader",
+ "TwitterTweetLoader",
"ImageCaptionLoader",
]
diff --git a/langchain/document_loaders/twitter.py b/langchain/document_loaders/twitter.py
new file mode 100644
index 00000000..2b1afd77
--- /dev/null
+++ b/langchain/document_loaders/twitter.py
@@ -0,0 +1,109 @@
+"""Twitter document loader."""
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+if TYPE_CHECKING:
+ import tweepy
+ from tweepy import OAuth2BearerHandler, OAuthHandler
+
+
+def _dependable_tweepy_import() -> tweepy:
+    """Return the ``tweepy`` module, raising ``ValueError`` if it is not installed."""
+    try:
+        import tweepy
+    except ImportError as err:
+        msg = "tweepy package not found, please install it with `pip install tweepy`"
+        raise ValueError(msg) from err
+    return tweepy
+
+
+class TwitterTweetLoader(BaseLoader):
+    """Load tweets from the timelines of the given Twitter users as Documents.
+
+    Needs the ``tweepy`` package and Twitter API credentials. First go to
+    `https://developer.twitter.com/en/docs/twitter-api
+    /getting-started/getting-access-to-the-twitter-api`
+    to get your token, and create a v2 version of the app.
+    """
+
+    def __init__(
+        self,
+        auth_handler: Union[OAuthHandler, OAuth2BearerHandler],
+        twitter_users: Sequence[str],
+        number_tweets: Optional[int] = 100,
+    ):
+        """Store the tweepy auth handler, the handles to read and the tweet count.
+
+        A bare string passed as ``twitter_users`` is treated as a single handle
+        rather than being iterated character by character.
+        """
+        self.auth = auth_handler
+        if isinstance(twitter_users, str):
+            twitter_users = [twitter_users]
+        self.twitter_users = twitter_users
+        self.number_tweets = number_tweets
+
+    def load(self) -> List[Document]:
+        """Fetch each user's timeline and profile; return one Document per tweet."""
+        tweepy = _dependable_tweepy_import()
+        # JSONParser: responses are consumed as plain dicts (see _format_tweets).
+        api = tweepy.API(self.auth, parser=tweepy.parsers.JSONParser())
+
+        results: List[Document] = []
+        for username in self.twitter_users:
+            tweets = api.user_timeline(screen_name=username, count=self.number_tweets)
+            user = api.get_user(screen_name=username)
+            results.extend(self._format_tweets(tweets, user))
+        return results
+
+    def _format_tweets(
+        self, tweets: List[Dict[str, Any]], user_info: dict
+    ) -> Iterable[Document]:
+        """Yield a Document per tweet: text as content, date and user as metadata."""
+        for tweet in tweets:
+            metadata = {"created_at": tweet["created_at"], "user_info": user_info}
+            yield Document(page_content=tweet["text"], metadata=metadata)
+
+    @classmethod
+    def from_bearer_token(
+        cls,
+        oauth2_bearer_token: str,
+        twitter_users: Sequence[str],
+        number_tweets: Optional[int] = 100,
+    ) -> TwitterTweetLoader:
+        """Create a TwitterTweetLoader from an OAuth2 bearer token."""
+        tweepy = _dependable_tweepy_import()
+        auth = tweepy.OAuth2BearerHandler(oauth2_bearer_token)
+        return cls(
+            auth_handler=auth,
+            twitter_users=twitter_users,
+            number_tweets=number_tweets,
+        )
+
+    @classmethod
+    def from_secrets(
+        cls,
+        access_token: str,
+        access_token_secret: str,
+        consumer_key: str,
+        consumer_secret: str,
+        twitter_users: Sequence[str],
+        number_tweets: Optional[int] = 100,
+    ) -> TwitterTweetLoader:
+        """Create a TwitterTweetLoader from an access token pair and consumer keys."""
+        tweepy = _dependable_tweepy_import()
+        auth = tweepy.OAuthHandler(
+            access_token=access_token,
+            access_token_secret=access_token_secret,
+            consumer_key=consumer_key,
+            consumer_secret=consumer_secret,
+        )
+        return cls(
+            auth_handler=auth,
+            twitter_users=twitter_users,
+            number_tweets=number_tweets,
+        )