diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py
index 2bd1c04a..4cbac88c 100644
--- a/langchain/document_loaders/whatsapp_chat.py
+++ b/langchain/document_loaders/whatsapp_chat.py
@@ -26,16 +26,31 @@ class WhatsAppChatLoader(BaseLoader):
         with open(p, encoding="utf8") as f:
             lines = f.readlines()
 
-        message_line_regex = (
-            r"(\d{1,2}/\d{1,2}/\d{2,4}, "
-            r"\d{1,2}:\d{1,2}[ _]?(?:AM|PM)?) - "
-            r"(.*?): (.*)"
-        )
-        for line in lines:
-            result = re.match(
-                message_line_regex,
-                line.strip(),
+        message_line_regex = r"""
+            \[?
+            (
+                \d{1,2}
+                [\/.]
+                \d{1,2}
+                [\/.]
+                \d{2,4}
+                ,\s
+                \d{1,2}
+                :\d{2}
+                (?:
+                    :\d{2}
+                )?
+                (?:[ _](?:AM|PM))?
             )
+            \]?
+            [\s-]*
+            ([\w\s]+)
+            [:]+
+            \s
+            (.+)
+        """
+        for line in lines:
+            result = re.match(message_line_regex, line.strip(), flags=re.VERBOSE)
             if result:
                 date, sender, text = result.groups()
                 text_content += concatenate_rows(date, sender, text)
diff --git a/tests/integration_tests/document_loaders/test_whatsapp_chat.py b/tests/integration_tests/document_loaders/test_whatsapp_chat.py
new file mode 100644
index 00000000..788c4099
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_whatsapp_chat.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+
+from langchain.document_loaders import WhatsAppChatLoader
+
+
+def test_whatsapp_chat_loader() -> None:
+    """Test WhatsAppChatLoader."""
+    file_path = Path(__file__).parent.parent / "examples" / "whatsapp_chat.txt"
+    loader = WhatsAppChatLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 1
+    assert docs[0].metadata["source"] == str(file_path)
+    assert docs[0].page_content == (
+        "James on 05.05.23, 15:48:11: Hi here\n\n"
+        "User name on 11/8/21, 9:41:32 AM: Message 123\n\n"
+        "User 2 on 1/23/23, 3:19 AM: Bye!\n\n"
+        "User 1 on 1/23/23, 3:22_AM: And let me know if anything changes\n\n"
+    )
diff --git a/tests/integration_tests/examples/whatsapp_chat.txt b/tests/integration_tests/examples/whatsapp_chat.txt
new file mode 100644
index 00000000..402df938
--- /dev/null
+++ b/tests/integration_tests/examples/whatsapp_chat.txt
@@ -0,0 +1,4 @@
+[05.05.23, 15:48:11] James: Hi here
+[11/8/21, 9:41:32 AM] User name: Message 123
+1/23/23, 3:19 AM - User 2: Bye!
+1/23/23, 3:22_AM - User 1: And let me know if anything changes
\ No newline at end of file