mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Harrison/fb loader (#1277)
Co-authored-by: Vairo Di Pasquale <vairo.dp@gmail.com>
This commit is contained in:
parent
bb53d9722d
commit
42167a1e24
@ -0,0 +1,64 @@
|
||||
{
|
||||
"participants": [{"name": "User 1"}, {"name": "User 2"}],
|
||||
"messages": [
|
||||
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675597435669,
|
||||
"content": "Oh no worries! Bye",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675596277579,
|
||||
"content": "No Im sorry it was my mistake, the blue one is not for sale",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675595140251,
|
||||
"content": "I thought you were selling the blue one!",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675595109305,
|
||||
"content": "Im not interested in this bag. Im interested in the blue one!",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595068468,
|
||||
"content": "Here is $129",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595060730,
|
||||
"photos": [
|
||||
{"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059}
|
||||
],
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595045152,
|
||||
"content": "Online is at least $100",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675594799696,
|
||||
"content": "How much do you want?",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675577876645,
|
||||
"content": "Goodmorning! $50 is too low.",
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675549022673,
|
||||
"content": "Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!",
|
||||
},
|
||||
],
|
||||
"title": "User 1 and User 2 chat",
|
||||
"is_still_participant": true,
|
||||
"thread_path": "inbox/User 1 and User 2 chat",
|
||||
"magic_words": [],
|
||||
"image": {"uri": "image_of_the_chat.jpg", "creation_timestamp": 1675549016},
|
||||
"joinable_mode": {"mode": 1, "link": ""},
|
||||
}
|
77
docs/modules/document_loaders/examples/facebook_chat.ipynb
Normal file
77
docs/modules/document_loaders/examples/facebook_chat.ipynb
Normal file
@ -0,0 +1,77 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Facebook Chat\n",
|
||||
"\n",
|
||||
"This notebook covers how to load data from the Facebook Chats into a format that can be ingested into LangChain."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import FacebookChatLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"loader = FacebookChatLoader(\"example_data/facebook_chat.json\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[Document(page_content='User 2 on 2023-02-05 12:46:11: Bye!\\n\\nUser 1 on 2023-02-05 12:43:55: Oh no worries! Bye\\n\\nUser 2 on 2023-02-05 12:24:37: No Im sorry it was my mistake, the blue one is not for sale\\n\\nUser 1 on 2023-02-05 12:05:40: I thought you were selling the blue one!\\n\\nUser 1 on 2023-02-05 12:05:09: Im not interested in this bag. Im interested in the blue one!\\n\\nUser 2 on 2023-02-05 12:04:28: Here is $129\\n\\nUser 2 on 2023-02-05 12:04:05: Online is at least $100\\n\\nUser 1 on 2023-02-05 11:59:59: How much do you want?\\n\\nUser 2 on 2023-02-05 07:17:56: Goodmorning! $50 is too low.\\n\\nUser 1 on 2023-02-04 23:17:02: Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!\\n\\n', lookup_str='', metadata={'source': 'docs/modules/document_loaders/examples/example_data/facebook_chat.json'}, lookup_index=0)]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader.load()"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.1"
|
||||
},
|
||||
"vscode": {
|
||||
"interpreter": {
|
||||
"hash": "384707f4965e853a82006e90614c2e1a578ea1f6eb0ee07a1dd78a657d37dd67"
|
||||
}
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -7,6 +7,7 @@ from langchain.document_loaders.directory import DirectoryLoader
|
||||
from langchain.document_loaders.docx import UnstructuredDocxLoader
|
||||
from langchain.document_loaders.email import UnstructuredEmailLoader
|
||||
from langchain.document_loaders.evernote import EverNoteLoader
|
||||
from langchain.document_loaders.facebook_chat import FacebookChatLoader
|
||||
from langchain.document_loaders.gcs_directory import GCSDirectoryLoader
|
||||
from langchain.document_loaders.gcs_file import GCSFileLoader
|
||||
from langchain.document_loaders.gitbook import GitbookLoader
|
||||
@ -72,5 +73,6 @@ __all__ = [
|
||||
"PDFMinerLoader",
|
||||
"TelegramChatLoader",
|
||||
"SRTLoader",
|
||||
"FacebookChatLoader",
|
||||
"NotebookLoader",
|
||||
]
|
||||
|
57
langchain/document_loaders/facebook_chat.py
Normal file
57
langchain/document_loaders/facebook_chat.py
Normal file
@ -0,0 +1,57 @@
|
||||
"""Loader that loads Facebook chat json dump."""
|
||||
import datetime
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
|
||||
|
||||
def concatenate_rows(row: dict) -> str:
|
||||
"""Combine message information in a readable format ready to be used."""
|
||||
sender = row["sender_name"]
|
||||
text = row["content"]
|
||||
date = datetime.datetime.fromtimestamp(row["timestamp_ms"] / 1000).strftime(
|
||||
"%Y-%m-%d %H:%M:%S"
|
||||
)
|
||||
return f"{sender} on {date}: {text}\n\n"
|
||||
|
||||
|
||||
class FacebookChatLoader(BaseLoader):
|
||||
"""Loader that loads Facebook messages json directory dump."""
|
||||
|
||||
def __init__(self, path: str):
|
||||
"""Initialize with path."""
|
||||
self.file_path = path
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"pandas is needed for Facebook chat loader, "
|
||||
"please install with `pip install pandas`"
|
||||
)
|
||||
p = Path(self.file_path)
|
||||
|
||||
with open(p, encoding="utf8") as f:
|
||||
d = json.load(f)
|
||||
|
||||
normalized_messages = pd.json_normalize(d["messages"])
|
||||
df_normalized_messages = pd.DataFrame(normalized_messages)
|
||||
|
||||
# Only keep plain text messages
|
||||
# (no services, nor links, hashtags, code, bold ...)
|
||||
df_filtered = df_normalized_messages[
|
||||
(df_normalized_messages.content.apply(lambda x: type(x) == str))
|
||||
]
|
||||
|
||||
df_filtered = df_filtered[["timestamp_ms", "content", "sender_name"]]
|
||||
|
||||
text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="")
|
||||
|
||||
metadata = {"source": str(p)}
|
||||
|
||||
return [Document(page_content=text, metadata=metadata)]
|
Loading…
Reference in New Issue
Block a user