mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
2a3c5f8353
This PR updates the `message_line_regex` used by `WhatsAppChatLoader` to support the different date-time formats used in WhatsApp chat exports; resolves #4153. The new regex handles the following input formats:

```terminal
[05.05.23, 15:48:11] James: Hi here
[11/8/21, 9:41:32 AM] User name: Message 123
1/23/23, 3:19 AM - User 2: Bye!
1/23/23, 3:22_AM - User 1: And let me know if anything changes
```

Tests have been added to verify that the loader works correctly with all formats.
20 lines
677 B
Python
20 lines
677 B
Python
from pathlib import Path
|
|
|
|
from langchain.document_loaders import WhatsAppChatLoader
|
|
|
|
|
|
def test_whatsapp_chat_loader() -> None:
    """Load the WhatsApp chat fixture and verify the resulting document.

    The fixture lives at ``examples/whatsapp_chat.txt`` two directory levels
    above this test module. The loader is expected to return exactly one
    document whose ``source`` metadata is the fixture path (as a string) and
    whose content lists each message as ``"<sender> on <timestamp>: <text>"``
    followed by a blank line. The expected messages carry timestamps written
    in several different layouts.
    """
    tests_root = Path(__file__).parent.parent
    chat_export = tests_root / "examples" / "whatsapp_chat.txt"
    source = str(chat_export)

    # One entry per message; each already carries its trailing blank line.
    expected_content = "".join(
        (
            "James on 05.05.23, 15:48:11: Hi here\n\n",
            "User name on 11/8/21, 9:41:32 AM: Message 123\n\n",
            "User 2 on 1/23/23, 3:19 AM: Bye!\n\n",
            "User 1 on 1/23/23, 3:22_AM: And let me know if anything changes\n\n",
        )
    )

    documents = WhatsAppChatLoader(source).load()

    # Check the count first so an empty result fails here, not on indexing.
    assert len(documents) == 1
    document = documents[0]
    assert document.metadata["source"] == source
    assert document.page_content == expected_content
|