From 374725a715d287fe2ddb9dfda36e0dc14efa254d Mon Sep 17 00:00:00 2001 From: hp0404 <39014459+hp0404@users.noreply.github.com> Date: Thu, 4 May 2023 01:59:19 +0300 Subject: [PATCH] Refactor TelegramChatLoader and FacebookChatLoader classes and add tests (#3863) This PR includes two main changes: - Refactor the `TelegramChatLoader` and `FacebookChatLoader` classes by removing the dependency on pandas and simplifying the message filtering process. - Add test cases for the `TelegramChatLoader` and `FacebookChatLoader` classes. This test ensures that the class correctly loads and processes the example chat data, providing better test coverage for this functionality. --- langchain/document_loaders/facebook_chat.py | 25 ++------ langchain/document_loaders/telegram.py | 25 ++------ .../document_loaders/test_facebook_chat.py | 28 ++++++++ .../document_loaders/test_telegram.py | 18 ++++++ .../examples/facebook_chat.json | 64 +++++++++++++++++++ .../integration_tests/examples/telegram.json | 31 +++++++++ 6 files changed, 151 insertions(+), 40 deletions(-) create mode 100644 tests/integration_tests/document_loaders/test_facebook_chat.py create mode 100644 tests/integration_tests/document_loaders/test_telegram.py create mode 100644 tests/integration_tests/examples/facebook_chat.json create mode 100644 tests/integration_tests/examples/telegram.json diff --git a/langchain/document_loaders/facebook_chat.py b/langchain/document_loaders/facebook_chat.py index d2dec9f0..40636198 100644 --- a/langchain/document_loaders/facebook_chat.py +++ b/langchain/document_loaders/facebook_chat.py @@ -27,31 +27,16 @@ class FacebookChatLoader(BaseLoader): def load(self) -> List[Document]: """Load documents.""" - try: - import pandas as pd - except ImportError: - raise ValueError( - "pandas is needed for Facebook chat loader, " - "please install with `pip install pandas`" - ) p = Path(self.file_path) with open(p, encoding="utf8") as f: d = json.load(f) - normalized_messages = pd.json_normalize(d["messages"]) - df_normalized_messages = pd.DataFrame(normalized_messages) - - # Only keep plain text messages - # (no services, nor links, hashtags, code, bold ...) - df_filtered = df_normalized_messages[ - (df_normalized_messages.content.apply(lambda x: type(x) == str)) - ] - - df_filtered = df_filtered[["timestamp_ms", "content", "sender_name"]] - - text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="") - + text = "".join( + concatenate_rows(message) + for message in d["messages"] + if message.get("content") and isinstance(message["content"], str) + ) metadata = {"source": str(p)} return [Document(page_content=text, metadata=metadata)] diff --git a/langchain/document_loaders/telegram.py b/langchain/document_loaders/telegram.py index 07f571d7..db304095 100644 --- a/langchain/document_loaders/telegram.py +++ b/langchain/document_loaders/telegram.py @@ -24,31 +24,16 @@ class TelegramChatLoader(BaseLoader): def load(self) -> List[Document]: """Load documents.""" - try: - import pandas as pd - except ImportError: - raise ValueError( - "pandas is needed for Telegram loader, " - "please install with `pip install pandas`" - ) p = Path(self.file_path) with open(p, encoding="utf8") as f: d = json.load(f) - normalized_messages = pd.json_normalize(d["messages"]) - df_normalized_messages = pd.DataFrame(normalized_messages) - - # Only keep plain text messages (no services, links, hashtags, code, bold...) - df_filtered = df_normalized_messages[ - (df_normalized_messages.type == "message") - & (df_normalized_messages.text.apply(lambda x: type(x) == str)) - ] - - df_filtered = df_filtered[["date", "text", "from"]] - - text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="") - + text = "".join( + concatenate_rows(message) + for message in d["messages"] + if message["type"] == "message" and isinstance(message["text"], str) + ) metadata = {"source": str(p)} return [Document(page_content=text, metadata=metadata)] diff --git a/tests/integration_tests/document_loaders/test_facebook_chat.py b/tests/integration_tests/document_loaders/test_facebook_chat.py new file mode 100644 index 00000000..eaa8f912 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_facebook_chat.py @@ -0,0 +1,28 @@ +from pathlib import Path + +from langchain.document_loaders import FacebookChatLoader + + +def test_facebook_chat_loader() -> None: + """Test FacebookChatLoader.""" + file_path = Path(__file__).parent.parent / "examples/facebook_chat.json" + loader = FacebookChatLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].metadata["source"] == str(file_path) + assert docs[0].page_content == ( + "User 2 on 2023-02-05 13:46:11: Bye!\n\n" + "User 1 on 2023-02-05 13:43:55: Oh no worries! Bye\n\n" + "User 2 on 2023-02-05 13:24:37: No Im sorry it was my mistake, " + "the blue one is not for sale\n\n" + "User 1 on 2023-02-05 13:05:40: I thought you were selling the blue one!\n\n" + "User 1 on 2023-02-05 13:05:09: Im not interested in this bag. " + "Im interested in the blue one!\n\n" + "User 2 on 2023-02-05 13:04:28: Here is $129\n\n" + "User 2 on 2023-02-05 13:04:05: Online is at least $100\n\n" + "User 1 on 2023-02-05 12:59:59: How much do you want?\n\n" + "User 2 on 2023-02-05 08:17:56: Goodmorning! $50 is too low.\n\n" + "User 1 on 2023-02-05 00:17:02: Hi! Im interested in your bag. " + "Im offering $50. Let me know if you are interested. Thanks!\n\n" + ) diff --git a/tests/integration_tests/document_loaders/test_telegram.py b/tests/integration_tests/document_loaders/test_telegram.py new file mode 100644 index 00000000..05e2f051 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_telegram.py @@ -0,0 +1,18 @@ +from pathlib import Path + +from langchain.document_loaders import TelegramChatLoader + + +def test_telegram_chat_loader() -> None: + """Test TelegramChatLoader.""" + file_path = Path(__file__).parent.parent / "examples/telegram.json" + loader = TelegramChatLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].metadata["source"] == str(file_path) + assert docs[0].page_content == ( + "Henry on 2020-01-01T00:00:02: It's 2020...\n\n" + "Henry on 2020-01-01T00:00:04: Fireworks!\n\n" + "Grace 🧤 ðŸ\x8d’ on 2020-01-01T00:00:05: You're a minute late!\n\n" + ) diff --git a/tests/integration_tests/examples/facebook_chat.json b/tests/integration_tests/examples/facebook_chat.json new file mode 100644 index 00000000..68c9c0c2 --- /dev/null +++ b/tests/integration_tests/examples/facebook_chat.json @@ -0,0 +1,64 @@ +{ + "participants": [{"name": "User 1"}, {"name": "User 2"}], + "messages": [ + {"sender_name": "User 2", "timestamp_ms": 1675597571851, "content": "Bye!"}, + { + "sender_name": "User 1", + "timestamp_ms": 1675597435669, + "content": "Oh no worries! Bye" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675596277579, + "content": "No Im sorry it was my mistake, the blue one is not for sale" + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675595140251, + "content": "I thought you were selling the blue one!" + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675595109305, + "content": "Im not interested in this bag. Im interested in the blue one!" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675595068468, + "content": "Here is $129" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675595060730, + "photos": [ + {"uri": "url_of_some_picture.jpg", "creation_timestamp": 1675595059} + ] + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675595045152, + "content": "Online is at least $100" + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675594799696, + "content": "How much do you want?" + }, + { + "sender_name": "User 2", + "timestamp_ms": 1675577876645, + "content": "Goodmorning! $50 is too low." + }, + { + "sender_name": "User 1", + "timestamp_ms": 1675549022673, + "content": "Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks!" + } + ], + "title": "User 1 and User 2 chat", + "is_still_participant": true, + "thread_path": "inbox/User 1 and User 2 chat", + "magic_words": [], + "image": {"uri": "image_of_the_chat.jpg", "creation_timestamp": 1675549016}, + "joinable_mode": {"mode": 1, "link": ""} +} diff --git a/tests/integration_tests/examples/telegram.json b/tests/integration_tests/examples/telegram.json new file mode 100644 index 00000000..f290a8a5 --- /dev/null +++ b/tests/integration_tests/examples/telegram.json @@ -0,0 +1,31 @@ +{ + "name": "Grace 🧤", + "type": "personal_chat", + "id": 2730825451, + "messages": [ + { + "id": 1980499, + "type": "message", + "date": "2020-01-01T00:00:02", + "from": "Henry", + "from_id": 4325636679, + "text": "It's 2020..." + }, + { + "id": 1980500, + "type": "message", + "date": "2020-01-01T00:00:04", + "from": "Henry", + "from_id": 4325636679, + "text": "Fireworks!" + }, + { + "id": 1980501, + "type": "message", + "date": "2020-01-01T00:00:05", + "from": "Grace 🧤 🍒", + "from_id": 4720225552, + "text": "You're a minute late!" + } + ] + } \ No newline at end of file