diff --git a/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt b/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt index 9c88f9be..acbe2953 100644 --- a/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt +++ b/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt @@ -8,4 +8,5 @@ 1/23/23, 3:02 AM - User 1: I thought you were selling the blue one! 1/23/23, 3:18 AM - User 2: No Im sorry it was my mistake, the blue one is not for sale 1/23/23, 3:19 AM - User 1: Oh no worries! Bye -1/23/23, 3:19 AM - User 2: Bye! \ No newline at end of file +1/23/23, 3:19 AM - User 2: Bye! +1/23/23, 3:22_AM - User 1: And let me know if anything changes \ No newline at end of file diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py index bbe12a62..2bd1c04a 100644 --- a/langchain/document_loaders/whatsapp_chat.py +++ b/langchain/document_loaders/whatsapp_chat.py @@ -26,9 +26,14 @@ class WhatsAppChatLoader(BaseLoader): with open(p, encoding="utf8") as f: lines = f.readlines() + message_line_regex = ( + r"(\d{1,2}/\d{1,2}/\d{2,4}, " + r"\d{1,2}:\d{1,2}[ _]?(?:AM|PM)?) - " + r"(.*?): (.*)" + ) for line in lines: result = re.match( - r"(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{1,2}(?: AM| PM)?) - (.*?): (.*)", + message_line_regex, line.strip(), ) if result: