diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py index c2ffc46ee5..b3022a555b 100644 --- a/langchain/document_loaders/whatsapp_chat.py +++ b/langchain/document_loaders/whatsapp_chat.py @@ -49,13 +49,15 @@ class WhatsAppChatLoader(BaseLoader): \s (.+) """ + ignore_lines = ["This message was deleted", ""] for line in lines: result = re.match( message_line_regex, line.strip(), flags=re.VERBOSE | re.IGNORECASE ) if result: date, sender, text = result.groups() - text_content += concatenate_rows(date, sender, text) + if text not in ignore_lines: + text_content += concatenate_rows(date, sender, text) metadata = {"source": str(p)} diff --git a/tests/integration_tests/examples/whatsapp_chat.txt b/tests/integration_tests/examples/whatsapp_chat.txt index bdd4d63fd0..605af130f2 100644 --- a/tests/integration_tests/examples/whatsapp_chat.txt +++ b/tests/integration_tests/examples/whatsapp_chat.txt @@ -6,3 +6,5 @@ [2023/5/4, 16:13:23] ~ User 2: See you! 7/19/22, 11:32 PM - User 1: Hello 7/20/22, 11:32 am - User 2: Goodbye +4/20/23, 9:42 am - User 3: +6/29/23, 12:16 am - User 4: This message was deleted