From 93c0514105ee499e5d6113819c1b507deeb5f0b2 Mon Sep 17 00:00:00 2001 From: Zander Chase <130414180+vowelparrot@users.noreply.github.com> Date: Mon, 17 Apr 2023 21:44:54 -0700 Subject: [PATCH] Add Twitter Tweet Loader (#3050) Reformatted version of #3022 --------- Co-authored-by: LiaoKong <568250549@qq.com> --- .../document_loaders/examples/twitter.ipynb | 114 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/twitter.py | 109 +++++++++++++++++ 3 files changed, 225 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/twitter.ipynb create mode 100644 langchain/document_loaders/twitter.py diff --git a/docs/modules/indexes/document_loaders/examples/twitter.ipynb b/docs/modules/indexes/document_loaders/examples/twitter.ipynb new file mode 100644 index 00000000..3713ad4f --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/twitter.ipynb @@ -0,0 +1,114 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "66a7777e", + "metadata": {}, + "source": [ + "# Twitter\n", + "\n", + "This loader fetches the text from the Tweets of a list of Twitter users, using the `tweepy` Python package.\n", + "You must initialize the loader with your Twitter API token, and you need to pass in the Twitter username you want to extract." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9ec8a3b3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TwitterTweetLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "43128d8d", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install tweepy" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "35d6809a", + "metadata": { + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [], + "source": [ + "loader = TwitterTweetLoader.from_bearer_token(\n", + " oauth2_bearer_token=\"YOUR BEARER TOKEN\",\n", + " twitter_users=['elonmusk'],\n", + " number_tweets=50, # Default value is 100\n", + ")\n", + "\n", + "# Or load from access token and consumer keys\n", + "# loader = TwitterTweetLoader.from_secrets(\n", + "# access_token='YOUR ACCESS TOKEN',\n", + "# access_token_secret='YOUR ACCESS TOKEN SECRET',\n", + "# consumer_key='YOUR CONSUMER KEY',\n", + "# consumer_secret='YOUR CONSUMER SECRET',\n", + "# twitter_users=['elonmusk'],\n", + "# number_tweets=50,\n", + "# )" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false, + "pycharm": { + "name": "#%%\n" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='@MrAndyNgo @REI One store after another shutting down', metadata={'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n", + " Document(page_content='@KanekoaTheGreat @joshrogin @glennbeck Large ships are fundamentally vulnerable to ballistic (hypersonic) missiles', metadata={'created_at': 'Tue Apr 18 03:43:25 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n", + " Document(page_content='@KanekoaTheGreat The Golden Rule', metadata={'created_at': 'Tue Apr 18 03:37:17 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n", + " Document(page_content='@KanekoaTheGreat 馃', metadata={'created_at': 'Tue Apr 18 03:35:48 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}}),\n", + " Document(page_content='@TRHLofficial What鈥檚 he talking about and why is it sponsored by Erik鈥檚 son?', metadata={'created_at': 'Tue Apr 18 03:32:17 +0000 2023', 'user_info': {'id': 44196397, 'id_str': '44196397', 'name': 'Elon Musk', 'screen_name': 'elonmusk', 'location': 'A Shortfall of Gravitas', 'profile_location': None, 'description': 'nothing', 'url': None, 'entities': {'description': {'urls': []}}, 'protected': False, 'followers_count': 135528327, 'friends_count': 220, 'listed_count': 120478, 'created_at': 'Tue Jun 02 20:12:29 +0000 2009', 'favourites_count': 21285, 'utc_offset': None, 'time_zone': None, 'geo_enabled': False, 'verified': False, 'statuses_count': 24795, 'lang': None, 'status': {'created_at': 'Tue Apr 18 03:45:50 +0000 2023', 'id': 1648170947541704705, 'id_str': '1648170947541704705', 'text': '@MrAndyNgo @REI One store after another shutting down', 'truncated': False, 'entities': {'hashtags': [], 'symbols': [], 'user_mentions': [{'screen_name': 'MrAndyNgo', 'name': 'Andy Ng么 馃彸锔廫\u200d馃寛', 'id': 2835451658, 'id_str': '2835451658', 'indices': [0, 10]}, {'screen_name': 'REI', 'name': 'REI', 'id': 16583846, 'id_str': '16583846', 'indices': [11, 15]}], 'urls': []}, 'source': 'Twitter for iPhone', 'in_reply_to_status_id': 1648134341678051328, 'in_reply_to_status_id_str': '1648134341678051328', 'in_reply_to_user_id': 2835451658, 'in_reply_to_user_id_str': '2835451658', 'in_reply_to_screen_name': 'MrAndyNgo', 'geo': None, 'coordinates': None, 'place': None, 'contributors': None, 'is_quote_status': False, 'retweet_count': 118, 'favorite_count': 1286, 'favorited': False, 'retweeted': False, 'lang': 'en'}, 'contributors_enabled': False, 'is_translator': False, 'is_translation_enabled': False, 'profile_background_color': 'C0DEED', 'profile_background_image_url': 'http://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_image_url_https': 'https://abs.twimg.com/images/themes/theme1/bg.png', 'profile_background_tile': False, 'profile_image_url': 'http://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_image_url_https': 'https://pbs.twimg.com/profile_images/1590968738358079488/IY9Gx6Ok_normal.jpg', 'profile_banner_url': 'https://pbs.twimg.com/profile_banners/44196397/1576183471', 'profile_link_color': '0084B4', 'profile_sidebar_border_color': 'C0DEED', 'profile_sidebar_fill_color': 'DDEEF6', 'profile_text_color': '333333', 'profile_use_background_image': True, 'has_extended_profile': True, 'default_profile': False, 'default_profile_image': False, 'following': None, 'follow_request_sent': None, 'notifications': None, 'translator_type': 'none', 'withheld_in_countries': []}})]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "documents = loader.load()\n", + "documents[:5]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index c47fb6cf..d12318f5 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -62,6 +62,7 @@ from langchain.document_loaders.slack_directory import SlackDirectoryLoader from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.text import TextLoader +from langchain.document_loaders.twitter import TwitterTweetLoader from langchain.document_loaders.unstructured import ( UnstructuredFileIOLoader, UnstructuredFileLoader, @@ -147,5 +148,6 @@ __all__ = [ "BiliBiliLoader", "SlackDirectoryLoader", "GitLoader", + "TwitterTweetLoader", "ImageCaptionLoader", ] diff --git a/langchain/document_loaders/twitter.py b/langchain/document_loaders/twitter.py new file mode 100644 index 00000000..2b1afd77 --- /dev/null +++ b/langchain/document_loaders/twitter.py @@ -0,0 +1,109 @@ +"""Twitter document loader.""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Sequence, Union + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + +if TYPE_CHECKING: + import tweepy + from tweepy import OAuth2BearerHandler, OAuthHandler + + +def _dependable_tweepy_import() -> tweepy: + try: + import tweepy + except ImportError: + raise ValueError( + "tweepy package not found, please install it with `pip install tweepy`" + ) + return tweepy + + +class TwitterTweetLoader(BaseLoader): + """Twitter tweets loader. + Read tweets of user twitter handle. + + First you need to go to + `https://developer.twitter.com/en/docs/twitter-api + /getting-started/getting-access-to-the-twitter-api` + to get your token. And create a v2 version of the app. + """ + + def __init__( + self, + auth_handler: Union[OAuthHandler, OAuth2BearerHandler], + twitter_users: Sequence[str], + number_tweets: Optional[int] = 100, + ): + self.auth = auth_handler + self.twitter_users = twitter_users + self.number_tweets = number_tweets + + def load(self) -> List[Document]: + """Load tweets.""" + tweepy = _dependable_tweepy_import() + api = tweepy.API(self.auth, parser=tweepy.parsers.JSONParser()) + + results: List[Document] = [] + for username in self.twitter_users: + tweets = api.user_timeline(screen_name=username, count=self.number_tweets) + user = api.get_user(screen_name=username) + docs = self._format_tweets(tweets, user) + results.extend(docs) + return results + + def _format_tweets( + self, tweets: List[Dict[str, Any]], user_info: dict + ) -> Iterable[Document]: + """Format tweets into a string.""" + for tweet in tweets: + metadata = { + "created_at": tweet["created_at"], + "user_info": user_info, + } + yield Document( + page_content=tweet["text"], + metadata=metadata, + ) + + @classmethod + def from_bearer_token( + cls, + oauth2_bearer_token: str, + twitter_users: Sequence[str], + number_tweets: Optional[int] = 100, + ) -> TwitterTweetLoader: + """Create a TwitterTweetLoader from OAuth2 bearer token.""" + tweepy = _dependable_tweepy_import() + auth = tweepy.OAuth2BearerHandler(oauth2_bearer_token) + return cls( + auth_handler=auth, + twitter_users=twitter_users, + number_tweets=number_tweets, + ) + + @classmethod + def from_secrets( + cls, + access_token: str, + access_token_secret: str, + consumer_key: str, + consumer_secret: str, + twitter_users: Sequence[str], + number_tweets: Optional[int] = 100, + ) -> TwitterTweetLoader: + """Create a TwitterTweetLoader from access tokens and secrets.""" + tweepy = _dependable_tweepy_import() + auth = tweepy.OAuthHandler( + access_token=access_token, + access_token_secret=access_token_secret, + consumer_key=consumer_key, + consumer_secret=consumer_secret, + ) + return cls( + auth_handler=auth, + twitter_users=twitter_users, + number_tweets=number_tweets, + )