From 2b1403612614127da4e3bd3d22595ce7b3eb1540 Mon Sep 17 00:00:00 2001 From: Rukmani Date: Tue, 9 May 2023 15:00:04 -0700 Subject: [PATCH] Update WhatsAppChatLoader to include the character ~ in the sender name (#4420) Fixes #4153 If the sender of a message in a group chat isn't in your contact list, they will appear with a ~ prefix in the exported chat. This PR adds support for parsing such lines. --- langchain/document_loaders/whatsapp_chat.py | 2 +- tests/integration_tests/document_loaders/test_whatsapp_chat.py | 1 + tests/integration_tests/examples/whatsapp_chat.txt | 3 ++- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py index 4cbac88c..19832e74 100644 --- a/langchain/document_loaders/whatsapp_chat.py +++ b/langchain/document_loaders/whatsapp_chat.py @@ -44,7 +44,7 @@ class WhatsAppChatLoader(BaseLoader): ) \]? [\s-]* - ([\w\s]+) + ([~\w\s]+) [:]+ \s (.+) diff --git a/tests/integration_tests/document_loaders/test_whatsapp_chat.py b/tests/integration_tests/document_loaders/test_whatsapp_chat.py index 788c4099..52394fd0 100644 --- a/tests/integration_tests/document_loaders/test_whatsapp_chat.py +++ b/tests/integration_tests/document_loaders/test_whatsapp_chat.py @@ -16,4 +16,5 @@ def test_whatsapp_chat_loader() -> None: "User name on 11/8/21, 9:41:32 AM: Message 123\n\n" "User 2 on 1/23/23, 3:19 AM: Bye!\n\n" "User 1 on 1/23/23, 3:22_AM: And let me know if anything changes\n\n" + "~ User name 2 on 1/24/21, 12:41:03 PM: Of course!\n\n" ) diff --git a/tests/integration_tests/examples/whatsapp_chat.txt b/tests/integration_tests/examples/whatsapp_chat.txt index 402df938..785b8b16 100644 --- a/tests/integration_tests/examples/whatsapp_chat.txt +++ b/tests/integration_tests/examples/whatsapp_chat.txt @@ -1,4 +1,5 @@ [05.05.23, 15:48:11] James: Hi here [11/8/21, 9:41:32 AM] User name: Message 123 1/23/23, 3:19 AM - User 2: Bye! -1/23/23, 3:22_AM - User 1: And let me know if anything changes \ No newline at end of file +1/23/23, 3:22_AM - User 1: And let me know if anything changes +[1/24/21, 12:41:03 PM] ~ User name 2: Of course! \ No newline at end of file