From 7688bf91829c95a861b911fa64d197bf20d1ea2c Mon Sep 17 00:00:00 2001 From: Rounak Datta Date: Thu, 13 Apr 2023 22:18:32 +0530 Subject: [PATCH] WhatsApp document loader - update regex (#2776) I was testing out the WhatsApp Document loader, and noticed that sometimes the date is of the following format (notice the additional underscore): ``` 3/24/23, 1:54_PM - +91 99999 99999 joined using this group's invite link 3/24/23, 6:29_PM - +91 99999 99999: When are we starting then? ``` Weirdly, the underscore is visible in Vim, but not on editors like VSCode. I presume it is some unusual character/line terminator. Nevertheless, I think handling this edge case will make the document loader more robust. --- .../examples/example_data/whatsapp_chat.txt | 3 ++- langchain/document_loaders/whatsapp_chat.py | 7 ++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt b/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt index 9c88f9bea2..acbe2953e9 100644 --- a/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt +++ b/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt @@ -8,4 +8,5 @@ 1/23/23, 3:02 AM - User 1: I thought you were selling the blue one! 1/23/23, 3:18 AM - User 2: No Im sorry it was my mistake, the blue one is not for sale 1/23/23, 3:19 AM - User 1: Oh no worries! Bye -1/23/23, 3:19 AM - User 2: Bye! 
+1/23/23, 3:22_AM - User 1: And let me know if anything changes \ No newline at end of file diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py index bbe12a6269..2bd1c04a17 100644 --- a/langchain/document_loaders/whatsapp_chat.py +++ b/langchain/document_loaders/whatsapp_chat.py @@ -26,9 +26,14 @@ class WhatsAppChatLoader(BaseLoader): with open(p, encoding="utf8") as f: lines = f.readlines() + message_line_regex = ( + r"(\d{1,2}/\d{1,2}/\d{2,4}, " + r"\d{1,2}:\d{1,2}[ _]?(?:AM|PM)?) - " + r"(.*?): (.*)" + ) for line in lines: result = re.match( - r"(\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{1,2}(?: AM| PM)?) - (.*?): (.*)", + message_line_regex, line.strip(), ) if result: