From 2a3c5f83537817d06ea8fad2836bbcd1cb33a551 Mon Sep 17 00:00:00 2001 From: hp0404 <39014459+hp0404@users.noreply.github.com> Date: Fri, 5 May 2023 23:13:05 +0300 Subject: [PATCH] Update WhatsAppChatLoader regex to handle multiple date-time formats (#4186) This PR updates the `message_line_regex` used by `WhatsAppChatLoader` to support different date-time formats used in WhatsApp chat exports; resolves #4153. The new regex handles the following input formats: ```terminal [05.05.23, 15:48:11] James: Hi here [11/8/21, 9:41:32 AM] User name: Message 123 1/23/23, 3:19 AM - User 2: Bye! 1/23/23, 3:22_AM - User 1: And let me know if anything changes ``` Tests have been added to verify that the loader works correctly with all formats. --- langchain/document_loaders/whatsapp_chat.py | 33 ++++++++++++++----- .../document_loaders/test_whatsapp_chat.py | 19 +++++++++++ .../examples/whatsapp_chat.txt | 4 +++ 3 files changed, 47 insertions(+), 9 deletions(-) create mode 100644 tests/integration_tests/document_loaders/test_whatsapp_chat.py create mode 100644 tests/integration_tests/examples/whatsapp_chat.txt diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py index 2bd1c04a..4cbac88c 100644 --- a/langchain/document_loaders/whatsapp_chat.py +++ b/langchain/document_loaders/whatsapp_chat.py @@ -26,16 +26,31 @@ class WhatsAppChatLoader(BaseLoader): with open(p, encoding="utf8") as f: lines = f.readlines() - message_line_regex = ( - r"(\d{1,2}/\d{1,2}/\d{2,4}, " - r"\d{1,2}:\d{1,2}[ _]?(?:AM|PM)?) - " - r"(.*?): (.*)" - ) - for line in lines: - result = re.match( - message_line_regex, - line.strip(), + message_line_regex = r""" + \[? + ( + \d{1,2} + [\/.] + \d{1,2} + [\/.] + \d{2,4} + ,\s + \d{1,2} + :\d{2} + (?: + :\d{2} + )? + (?:[ _](?:AM|PM))? ) + \]? + [\s-]* + ([\w\s]+) + [:]+ + \s + (.+) + """ + for line in lines: + result = re.match(message_line_regex, line.strip(), flags=re.VERBOSE) if result: date, sender, text = result.groups() text_content += concatenate_rows(date, sender, text) diff --git a/tests/integration_tests/document_loaders/test_whatsapp_chat.py b/tests/integration_tests/document_loaders/test_whatsapp_chat.py new file mode 100644 index 00000000..788c4099 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_whatsapp_chat.py @@ -0,0 +1,19 @@ +from pathlib import Path + +from langchain.document_loaders import WhatsAppChatLoader + + +def test_whatsapp_chat_loader() -> None: + """Test WhatsAppChatLoader.""" + file_path = Path(__file__).parent.parent / "examples" / "whatsapp_chat.txt" + loader = WhatsAppChatLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 1 + assert docs[0].metadata["source"] == str(file_path) + assert docs[0].page_content == ( + "James on 05.05.23, 15:48:11: Hi here\n\n" + "User name on 11/8/21, 9:41:32 AM: Message 123\n\n" + "User 2 on 1/23/23, 3:19 AM: Bye!\n\n" + "User 1 on 1/23/23, 3:22_AM: And let me know if anything changes\n\n" + ) diff --git a/tests/integration_tests/examples/whatsapp_chat.txt b/tests/integration_tests/examples/whatsapp_chat.txt new file mode 100644 index 00000000..402df938 --- /dev/null +++ b/tests/integration_tests/examples/whatsapp_chat.txt @@ -0,0 +1,4 @@ +[05.05.23, 15:48:11] James: Hi here +[11/8/21, 9:41:32 AM] User name: Message 123 +1/23/23, 3:19 AM - User 2: Bye! +1/23/23, 3:22_AM - User 1: And let me know if anything changes \ No newline at end of file