From 42167a1e2420b69e0596a861550c03eb246f1047 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Fri, 24 Feb 2023 07:22:48 -0800 Subject: [PATCH] Harrison/fb loader (#1277) Co-authored-by: Vairo Di Pasquale --- .../examples/example_data/facebook_chat.json | 64 +++++++++++++++ .../examples/facebook_chat.ipynb | 77 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/facebook_chat.py | 57 ++++++++++++++ 4 files changed, 200 insertions(+) create mode 100644 docs/modules/document_loaders/examples/example_data/facebook_chat.json create mode 100644 docs/modules/document_loaders/examples/facebook_chat.ipynb create mode 100644 langchain/document_loaders/facebook_chat.py diff --git a/docs/modules/document_loaders/examples/example_data/facebook_chat.json b/docs/modules/document_loaders/examples/example_data/facebook_chat.json new file mode 100644 index 00000000..b8baaa87 --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/facebook_chat.json @@ -0,0 +1,64 @@ +{ + "participants": [{"name": "User 1"}, {"name": "User 2"}], + "messages": [ + {"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}, + { + "sender_name": "User 1", + "timestamp_ms": 1675597435669, + "content": "Oh no worries! Bye" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675596277579, + "content": "No Im sorry it was my mistake, the blue one is not for sale" + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675595140251, + "content": "I thought you were selling the blue one!" + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675595109305, + "content": "Im not interested in this bag. 
Im interested in the blue one!" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675595068468, + "content": "Here is $129" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675595060730, + "photos": [ + {"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059} + ] + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675595045152, + "content": "Online is at least $100" + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675594799696, + "content": "How much do you want?" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675577876645, + "content": "Goodmorning! $50 is too low." + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675549022673, + "content": "Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!" + } + ], + "title": "User 1 and User 2 chat", + "is_still_participant": true, + "thread_path": "inbox/User 1 and User 2 chat", + "magic_words": [], + "image": {"uri": "image_of_the_chat.jpg", "creation_timestamp": 1675549016}, + "joinable_mode": {"mode": 1, "link": ""} +} diff --git a/docs/modules/document_loaders/examples/facebook_chat.ipynb b/docs/modules/document_loaders/examples/facebook_chat.ipynb new file mode 100644 index 00000000..7c60f68f --- /dev/null +++ b/docs/modules/document_loaders/examples/facebook_chat.ipynb @@ -0,0 +1,77 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Facebook Chat\n", + "\n", + "This notebook covers how to load data from the Facebook Chats into a format that can be ingested into LangChain." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import FacebookChatLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = FacebookChatLoader(\"example_data/facebook_chat.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='User 2 on 2023-02-05 12:46:11: Bye!\\n\\nUser 1 on 2023-02-05 12:43:55: Oh no worries! Bye\\n\\nUser 2 on 2023-02-05 12:24:37: No Im sorry it was my mistake, the blue one is not for sale\\n\\nUser 1 on 2023-02-05 12:05:40: I thought you were selling the blue one!\\n\\nUser 1 on 2023-02-05 12:05:09: Im not interested in this bag. Im interested in the blue one!\\n\\nUser 2 on 2023-02-05 12:04:28: Here is $129\\n\\nUser 2 on 2023-02-05 12:04:05: Online is at least $100\\n\\nUser 1 on 2023-02-05 11:59:59: How much do you want?\\n\\nUser 2 on 2023-02-05 07:17:56: Goodmorning! $50 is too low.\\n\\nUser 1 on 2023-02-04 23:17:02: Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. 
Thanks!\\n\\n', lookup_str='', metadata={'source': 'docs/modules/document_loaders/examples/example_data/facebook_chat.json'}, lookup_index=0)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "vscode": { + "interpreter": { + "hash": "384707f4965e853a82006e90614c2e1a578ea1f6eb0ee07a1dd78a657d37dd67" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3b170f09..d54ed8e1 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -7,6 +7,7 @@ from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.docx import UnstructuredDocxLoader from langchain.document_loaders.email import UnstructuredEmailLoader from langchain.document_loaders.evernote import EverNoteLoader +from langchain.document_loaders.facebook_chat import FacebookChatLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader from langchain.document_loaders.gcs_file import GCSFileLoader from langchain.document_loaders.gitbook import GitbookLoader @@ -72,5 +73,6 @@ __all__ = [ "PDFMinerLoader", "TelegramChatLoader", "SRTLoader", + "FacebookChatLoader", "NotebookLoader", ] diff --git a/langchain/document_loaders/facebook_chat.py b/langchain/document_loaders/facebook_chat.py new file mode 100644 index 00000000..d2dec9f0 --- /dev/null +++ b/langchain/document_loaders/facebook_chat.py @@ -0,0 +1,57 @@ +"""Loader that loads Facebook chat json dump.""" 
import datetime
import json
from pathlib import Path
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


def concatenate_rows(row: dict) -> str:
    """Combine message information in a readable format ready to be used.

    Args:
        row: Mapping (a dict or a pandas row) providing the keys
            ``"sender_name"``, ``"content"`` and ``"timestamp_ms"``
            (the timestamp is divided by 1000 before conversion).

    Returns:
        A string of the form ``"<sender> on <YYYY-MM-DD HH:MM:SS>: <text>"``
        terminated by two newline characters.
    """
    sender = row["sender_name"]
    text = row["content"]
    # NOTE: ``fromtimestamp`` without a tz argument renders the time in the
    # local time zone of the machine running the loader.
    date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    return f"{sender} on {date}: {text}\n\n"


class FacebookChatLoader(BaseLoader):
    """Loader that loads a Facebook messages json file dump."""

    def __init__(self, path: str):
        """Initialize with the path of the chat json file."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load the chat file into a single ``Document``.

        Only messages whose ``content`` is a plain string are kept; every
        kept message is rendered with ``concatenate_rows`` and the results
        are concatenated in file order.

        Returns:
            A one-element list holding a ``Document`` whose ``page_content``
            is the concatenated text (empty when no message has string
            content) and whose metadata records the source path.

        Raises:
            ValueError: If pandas is not installed.
        """
        try:
            import pandas as pd
        except ImportError as exc:
            # ValueError is kept for backward compatibility with callers;
            # the original ImportError is chained for debugging.
            raise ValueError(
                "pandas is needed for Facebook chat loader, "
                "please install with `pip install pandas`"
            ) from exc
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # json_normalize already returns a DataFrame.
        df_messages = pd.json_normalize(d["messages"])

        text = ""
        # The "content" column is absent when the chat is empty or holds
        # only non-text messages (photos, ...); guard instead of crashing.
        if "content" in df_messages.columns:
            # Only keep plain text messages
            # (no services, nor links, hashtags, code, bold ...)
            is_text = df_messages["content"].apply(lambda x: isinstance(x, str))
            df_filtered = df_messages.loc[
                is_text, ["timestamp_ms", "content", "sender_name"]
            ]
            # Joining over records also works when the filter leaves no rows,
            # where DataFrame.apply(axis=1) would not return a Series.
            text = "".join(
                concatenate_rows(record)
                for record in df_filtered.to_dict(orient="records")
            )

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]