forked from Archives/langchain
Refactor TelegramChatLoader and FacebookChatLoader classes and add tests (#3863)
This PR includes two main changes: - Refactor the `TelegramChatLoader` and `FacebookChatLoader` classes by removing the dependency on pandas and simplifying the message filtering process. - Add test cases for the `TelegramChatLoader` and `FacebookChatLoader` classes. These tests ensure that each class correctly loads and processes its example chat data, providing better test coverage for this functionality.
This commit is contained in:
parent
ea64b1716d
commit
374725a715
@ -27,31 +27,16 @@ class FacebookChatLoader(BaseLoader):
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"pandas is needed for Facebook chat loader, "
|
||||
"please install with `pip install pandas`"
|
||||
)
|
||||
p = Path(self.file_path)
|
||||
|
||||
with open(p, encoding="utf8") as f:
|
||||
d = json.load(f)
|
||||
|
||||
normalized_messages = pd.json_normalize(d["messages"])
|
||||
df_normalized_messages = pd.DataFrame(normalized_messages)
|
||||
|
||||
# Only keep plain text messages
|
||||
# (no services, nor links, hashtags, code, bold ...)
|
||||
df_filtered = df_normalized_messages[
|
||||
(df_normalized_messages.content.apply(lambda x: type(x) == str))
|
||||
]
|
||||
|
||||
df_filtered = df_filtered[["timestamp_ms", "content", "sender_name"]]
|
||||
|
||||
text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="")
|
||||
|
||||
text = "".join(
|
||||
concatenate_rows(message)
|
||||
for message in d["messages"]
|
||||
if message.get("content") and isinstance(message["content"], str)
|
||||
)
|
||||
metadata = {"source": str(p)}
|
||||
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
@ -24,31 +24,16 @@ class TelegramChatLoader(BaseLoader):
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
"""Load documents."""
|
||||
try:
|
||||
import pandas as pd
|
||||
except ImportError:
|
||||
raise ValueError(
|
||||
"pandas is needed for Telegram loader, "
|
||||
"please install with `pip install pandas`"
|
||||
)
|
||||
p = Path(self.file_path)
|
||||
|
||||
with open(p, encoding="utf8") as f:
|
||||
d = json.load(f)
|
||||
|
||||
normalized_messages = pd.json_normalize(d["messages"])
|
||||
df_normalized_messages = pd.DataFrame(normalized_messages)
|
||||
|
||||
# Only keep plain text messages (no services, links, hashtags, code, bold...)
|
||||
df_filtered = df_normalized_messages[
|
||||
(df_normalized_messages.type == "message")
|
||||
& (df_normalized_messages.text.apply(lambda x: type(x) == str))
|
||||
]
|
||||
|
||||
df_filtered = df_filtered[["date", "text", "from"]]
|
||||
|
||||
text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="")
|
||||
|
||||
text = "".join(
|
||||
concatenate_rows(message)
|
||||
for message in d["messages"]
|
||||
if message["type"] == "message" and isinstance(message["text"], str)
|
||||
)
|
||||
metadata = {"source": str(p)}
|
||||
|
||||
return [Document(page_content=text, metadata=metadata)]
|
||||
|
@ -0,0 +1,28 @@
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import FacebookChatLoader
|
||||
|
||||
|
||||
def test_facebook_chat_loader() -> None:
|
||||
"""Test FacebookChatLoader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/facebook_chat.json"
|
||||
loader = FacebookChatLoader(str(file_path))
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
assert docs[0].metadata["source"] == str(file_path)
|
||||
assert docs[0].page_content == (
|
||||
"User 2 on 2023-02-05 13:46:11: Bye!\n\n"
|
||||
"User 1 on 2023-02-05 13:43:55: Oh no worries! Bye\n\n"
|
||||
"User 2 on 2023-02-05 13:24:37: No Im sorry it was my mistake, "
|
||||
"the blue one is not for sale\n\n"
|
||||
"User 1 on 2023-02-05 13:05:40: I thought you were selling the blue one!\n\n"
|
||||
"User 1 on 2023-02-05 13:05:09: Im not interested in this bag. "
|
||||
"Im interested in the blue one!\n\n"
|
||||
"User 2 on 2023-02-05 13:04:28: Here is $129\n\n"
|
||||
"User 2 on 2023-02-05 13:04:05: Online is at least $100\n\n"
|
||||
"User 1 on 2023-02-05 12:59:59: How much do you want?\n\n"
|
||||
"User 2 on 2023-02-05 08:17:56: Goodmorning! $50 is too low.\n\n"
|
||||
"User 1 on 2023-02-05 00:17:02: Hi! Im interested in your bag. "
|
||||
"Im offering $50. Let me know if you are interested. Thanks!\n\n"
|
||||
)
|
18
tests/integration_tests/document_loaders/test_telegram.py
Normal file
18
tests/integration_tests/document_loaders/test_telegram.py
Normal file
@ -0,0 +1,18 @@
|
||||
from pathlib import Path
|
||||
|
||||
from langchain.document_loaders import TelegramChatLoader
|
||||
|
||||
|
||||
def test_telegram_chat_loader() -> None:
|
||||
"""Test TelegramChatLoader."""
|
||||
file_path = Path(__file__).parent.parent / "examples/telegram.json"
|
||||
loader = TelegramChatLoader(str(file_path))
|
||||
docs = loader.load()
|
||||
|
||||
assert len(docs) == 1
|
||||
assert docs[0].metadata["source"] == str(file_path)
|
||||
assert docs[0].page_content == (
|
||||
"Henry on 2020-01-01T00:00:02: It's 2020...\n\n"
|
||||
"Henry on 2020-01-01T00:00:04: Fireworks!\n\n"
|
||||
"Grace 🧤 ðŸ\x8d’ on 2020-01-01T00:00:05: You're a minute late!\n\n"
|
||||
)
|
64
tests/integration_tests/examples/facebook_chat.json
Normal file
64
tests/integration_tests/examples/facebook_chat.json
Normal file
@ -0,0 +1,64 @@
|
||||
{
|
||||
"participants": [{"name": "User 1"}, {"name": "User 2"}],
|
||||
"messages": [
|
||||
{"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675597435669,
|
||||
"content": "Oh no worries! Bye"
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675596277579,
|
||||
"content": "No Im sorry it was my mistake, the blue one is not for sale"
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675595140251,
|
||||
"content": "I thought you were selling the blue one!"
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675595109305,
|
||||
"content": "Im not interested in this bag. Im interested in the blue one!"
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595068468,
|
||||
"content": "Here is $129"
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595060730,
|
||||
"photos": [
|
||||
{"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059}
|
||||
]
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675595045152,
|
||||
"content": "Online is at least $100"
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675594799696,
|
||||
"content": "How much do you want?"
|
||||
},
|
||||
{
|
||||
"sender_name": "User 2",
|
||||
"timestamp_ms": 1675577876645,
|
||||
"content": "Goodmorning! $50 is too low."
|
||||
},
|
||||
{
|
||||
"sender_name": "User 1",
|
||||
"timestamp_ms": 1675549022673,
|
||||
"content": "Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!"
|
||||
}
|
||||
],
|
||||
"title": "User 1 and User 2 chat",
|
||||
"is_still_participant": true,
|
||||
"thread_path": "inbox/User 1 and User 2 chat",
|
||||
"magic_words": [],
|
||||
"image": {"uri": "image_of_the_chat.jpg", "creation_timestamp": 1675549016},
|
||||
"joinable_mode": {"mode": 1, "link": ""}
|
||||
}
|
31
tests/integration_tests/examples/telegram.json
Normal file
31
tests/integration_tests/examples/telegram.json
Normal file
@ -0,0 +1,31 @@
|
||||
{
|
||||
"name": "Grace 🧤",
|
||||
"type": "personal_chat",
|
||||
"id": 2730825451,
|
||||
"messages": [
|
||||
{
|
||||
"id": 1980499,
|
||||
"type": "message",
|
||||
"date": "2020-01-01T00:00:02",
|
||||
"from": "Henry",
|
||||
"from_id": 4325636679,
|
||||
"text": "It's 2020..."
|
||||
},
|
||||
{
|
||||
"id": 1980500,
|
||||
"type": "message",
|
||||
"date": "2020-01-01T00:00:04",
|
||||
"from": "Henry",
|
||||
"from_id": 4325636679,
|
||||
"text": "Fireworks!"
|
||||
},
|
||||
{
|
||||
"id": 1980501,
|
||||
"type": "message",
|
||||
"date": "2020-01-01T00:00:05",
|
||||
"from": "Grace 🧤 ðŸ’",
|
||||
"from_id": 4720225552,
|
||||
"text": "You're a minute late!"
|
||||
}
|
||||
]
|
||||
}
|
Loading…
Reference in New Issue
Block a user