Refactor TelegramChatLoader and FacebookChatLoader classes and add tests (#3863)

This PR includes two main changes:

- Refactor the `TelegramChatLoader` and `FacebookChatLoader` classes by
removing the dependency on pandas and simplifying the message filtering
process.

- Add test cases for the `TelegramChatLoader` and `FacebookChatLoader`
classes. These tests ensure that each class correctly loads and processes
its example chat data, providing better test coverage for this
functionality.
This commit is contained in:
hp0404 2023-05-04 01:59:19 +03:00 committed by GitHub
parent ea64b1716d
commit 374725a715
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 151 additions and 40 deletions

View File

@ -27,31 +27,16 @@ class FacebookChatLoader(BaseLoader):
def load(self) -> List[Document]:
    """Load the Facebook chat export as a single document.

    Reads the JSON file at ``self.file_path``, keeps only the entries of
    ``"messages"`` whose ``"content"`` is a non-empty string (entries such
    as photo-only messages carry no ``"content"`` key), and joins the
    per-message text produced by ``concatenate_rows``.

    Returns:
        A one-element list holding a ``Document`` whose ``page_content`` is
        the concatenated chat text and whose ``metadata["source"]`` is the
        file path as a string.
    """
    p = Path(self.file_path)

    with open(p, encoding="utf8") as f:
        d = json.load(f)

    # Only keep plain text messages; plain dict filtering replaces the
    # former pandas-based normalisation, so pandas is no longer required.
    text = "".join(
        concatenate_rows(message)
        for message in d["messages"]
        if message.get("content") and isinstance(message["content"], str)
    )
    metadata = {"source": str(p)}

    return [Document(page_content=text, metadata=metadata)]

View File

@ -24,31 +24,16 @@ class TelegramChatLoader(BaseLoader):
def load(self) -> List[Document]:
    """Load the Telegram chat export as a single document.

    Reads the JSON file at ``self.file_path``, keeps only the entries of
    ``"messages"`` whose ``"type"`` is ``"message"`` and whose ``"text"``
    is a plain string, and joins the per-message text produced by
    ``concatenate_rows``.

    Returns:
        A one-element list holding a ``Document`` whose ``page_content`` is
        the concatenated chat text and whose ``metadata["source"]`` is the
        file path as a string.
    """
    p = Path(self.file_path)

    with open(p, encoding="utf8") as f:
        d = json.load(f)

    # Only keep plain text messages (no services, links, hashtags, code,
    # bold...). ``.get`` is used so that an entry lacking "type" or "text"
    # is skipped instead of raising KeyError, matching the tolerance of the
    # Facebook loader's filter.
    text = "".join(
        concatenate_rows(message)
        for message in d["messages"]
        if message.get("type") == "message" and isinstance(message.get("text"), str)
    )
    metadata = {"source": str(p)}

    return [Document(page_content=text, metadata=metadata)]

View File

@ -0,0 +1,28 @@
from pathlib import Path
from langchain.document_loaders import FacebookChatLoader
def test_facebook_chat_loader() -> None:
    """Test FacebookChatLoader.

    Checks that the example export loads into exactly one document, that
    the source metadata is the fixture path, and that every text message
    appears in file order with its sender. Photo-only entries are expected
    to be skipped.
    """
    import re

    file_path = Path(__file__).parent.parent / "examples/facebook_chat.json"
    loader = FacebookChatLoader(str(file_path))
    docs = loader.load()

    assert len(docs) == 1
    assert docs[0].metadata["source"] == str(file_path)

    # The fixture stores epoch milliseconds (e.g. 1675597571851, which is
    # 11:46:11 UTC), while the rendered text carries a wall-clock time with
    # no zone. Hard-coding "13:46:11" only passes on a UTC+2 machine, so the
    # timestamps are masked and only their presence and format are checked.
    timestamp = re.compile(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}")
    content = timestamp.sub("<ts>", docs[0].page_content)

    assert content == (
        "User 2 on <ts>: Bye!\n\n"
        "User 1 on <ts>: Oh no worries! Bye\n\n"
        "User 2 on <ts>: No Im sorry it was my mistake, "
        "the blue one is not for sale\n\n"
        "User 1 on <ts>: I thought you were selling the blue one!\n\n"
        "User 1 on <ts>: Im not interested in this bag. "
        "Im interested in the blue one!\n\n"
        "User 2 on <ts>: Here is $129\n\n"
        "User 2 on <ts>: Online is at least $100\n\n"
        "User 1 on <ts>: How much do you want?\n\n"
        "User 2 on <ts>: Goodmorning! $50 is too low.\n\n"
        "User 1 on <ts>: Hi! Im interested in your bag. "
        "Im offering $50. Let me know if you are interested. Thanks!\n\n"
    )

View File

@ -0,0 +1,18 @@
from pathlib import Path
from langchain.document_loaders import TelegramChatLoader
def test_telegram_chat_loader() -> None:
    """Test TelegramChatLoader.

    Checks that the example export loads into exactly one document, that
    the source metadata is the fixture path, and that the three messages
    appear in file order with their sender and date.
    """
    file_path = Path(__file__).parent.parent / "examples/telegram.json"
    loader = TelegramChatLoader(str(file_path))
    docs = loader.load()

    assert len(docs) == 1
    assert docs[0].metadata["source"] == str(file_path)
    # The sender name must match the fixture's "from" field exactly; the
    # previous expectation held a mis-decoded (mojibake) form of the emoji.
    assert docs[0].page_content == (
        "Henry on 2020-01-01T00:00:02: It's 2020...\n\n"
        "Henry on 2020-01-01T00:00:04: Fireworks!\n\n"
        "Grace 🧤 🍒 on 2020-01-01T00:00:05: You're a minute late!\n\n"
    )

View File

@ -0,0 +1,64 @@
{
"participants": [{"name": "User 1"}, {"name": "User 2"}],
"messages": [
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"},
{
"sender_name": "User 1",
"timestamp_ms": 1675597435669,
"content": "Oh no worries! Bye"
},
{
"sender_name": "User 2",
"timestamp_ms": 1675596277579,
"content": "No Im sorry it was my mistake, the blue one is not for sale"
},
{
"sender_name": "User 1",
"timestamp_ms": 1675595140251,
"content": "I thought you were selling the blue one!"
},
{
"sender_name": "User 1",
"timestamp_ms": 1675595109305,
"content": "Im not interested in this bag. Im interested in the blue one!"
},
{
"sender_name": "User 2",
"timestamp_ms": 1675595068468,
"content": "Here is $129"
},
{
"sender_name": "User 2",
"timestamp_ms": 1675595060730,
"photos": [
{"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059}
]
},
{
"sender_name": "User 2",
"timestamp_ms": 1675595045152,
"content": "Online is at least $100"
},
{
"sender_name": "User 1",
"timestamp_ms": 1675594799696,
"content": "How much do you want?"
},
{
"sender_name": "User 2",
"timestamp_ms": 1675577876645,
"content": "Goodmorning! $50 is too low."
},
{
"sender_name": "User 1",
"timestamp_ms": 1675549022673,
"content": "Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!"
}
],
"title": "User 1 and User 2 chat",
"is_still_participant": true,
"thread_path": "inbox/User 1 and User 2 chat",
"magic_words": [],
"image": {"uri": "image_of_the_chat.jpg", "creation_timestamp": 1675549016},
"joinable_mode": {"mode": 1, "link": ""}
}

View File

@ -0,0 +1,31 @@
{
"name": "Grace 🧤",
"type": "personal_chat",
"id": 2730825451,
"messages": [
{
"id": 1980499,
"type": "message",
"date": "2020-01-01T00:00:02",
"from": "Henry",
"from_id": 4325636679,
"text": "It's 2020..."
},
{
"id": 1980500,
"type": "message",
"date": "2020-01-01T00:00:04",
"from": "Henry",
"from_id": 4325636679,
"text": "Fireworks!"
},
{
"id": 1980501,
"type": "message",
"date": "2020-01-01T00:00:05",
"from": "Grace 🧤 🍒",
"from_id": 4720225552,
"text": "You're a minute late!"
}
]
}