From d5825bd3e8805941d1ed38d68dc7933eca8f6d9d Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 27 Mar 2023 23:43:45 -0700 Subject: [PATCH] Harrison/whatsapp loader (#2085) Co-authored-by: Moshe --- .../examples/example_data/whatsapp_chat.txt | 11 +++ .../examples/whatsapp_chat.ipynb | 67 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/whatsapp_chat.py | 40 +++++++++++ 4 files changed, 120 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt create mode 100644 docs/modules/indexes/document_loaders/examples/whatsapp_chat.ipynb create mode 100644 langchain/document_loaders/whatsapp_chat.py diff --git a/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt b/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt new file mode 100644 index 00000000..9c88f9be --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/example_data/whatsapp_chat.txt @@ -0,0 +1,11 @@ +1/22/23, 6:30 PM - User 1: Hi! Im interested in your bag. Im offering $50. Let me know if you are interested. Thanks! +1/22/23, 8:24 PM - User 2: Goodmorning! $50 is too low. +1/23/23, 2:59 AM - User 1: How much do you want? +1/23/23, 3:00 AM - User 2: Online is at least $100 +1/23/23, 3:01 AM - User 2: Here is $129 +1/23/23, 3:01 AM - User 2: +1/23/23, 3:01 AM - User 1: Im not interested in this bag. Im interested in the blue one! +1/23/23, 3:02 AM - User 1: I thought you were selling the blue one! +1/23/23, 3:18 AM - User 2: No Im sorry it was my mistake, the blue one is not for sale +1/23/23, 3:19 AM - User 1: Oh no worries! Bye +1/23/23, 3:19 AM - User 2: Bye! 
\ No newline at end of file diff --git a/docs/modules/indexes/document_loaders/examples/whatsapp_chat.ipynb b/docs/modules/indexes/document_loaders/examples/whatsapp_chat.ipynb new file mode 100644 index 00000000..0744773e --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/whatsapp_chat.ipynb @@ -0,0 +1,67 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### WhatsApp Chat\n", + "\n", + "This notebook covers how to load data from the WhatsApp Chats into a format that can be ingested into LangChain." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import WhatsAppChatLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = WhatsAppChatLoader(\"example_data/whatsapp_chat.txt\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.1" + }, + "vscode": { + "interpreter": { + "hash": "384707f4965e853a82006e90614c2e1a578ea1f6eb0ee07a1dd78a657d37dd67" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 43420ef2..04837230 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -55,6 +55,7 @@ from langchain.document_loaders.unstructured import ( ) from langchain.document_loaders.url import UnstructuredURLLoader from 
langchain.document_loaders.web_base import WebBaseLoader
+from langchain.document_loaders.whatsapp_chat import WhatsAppChatLoader
 from langchain.document_loaders.word_document import UnstructuredWordDocumentLoader
 from langchain.document_loaders.youtube import (
     GoogleApiClient,
@@ -113,6 +114,7 @@ __all__ = [
     "GoogleApiClient",
     "CSVLoader",
     "BlackboardLoader",
+    "WhatsAppChatLoader",
     "DataFrameLoader",
     "AzureBlobStorageFileLoader",
     "AzureBlobStorageContainerLoader",
diff --git a/langchain/document_loaders/whatsapp_chat.py b/langchain/document_loaders/whatsapp_chat.py
new file mode 100644
index 00000000..e7eeeace
--- /dev/null
+++ b/langchain/document_loaders/whatsapp_chat.py
@@ -0,0 +1,81 @@
+import re
+from pathlib import Path
+from typing import List
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+# Timestamp that opens every exported line, e.g. "1/22/23, 6:30 PM".
+_TIMESTAMP = r"\d{1,2}/\d{1,2}/\d{2,4}, \d{1,2}:\d{1,2} (?:AM|PM)"
+# A user message: "<timestamp> - <sender>: <text>".
+_MESSAGE_RE = re.compile("(" + _TIMESTAMP + r") - (.*?): (.*)")
+# Any timestamped line, including ones without a "<sender>: <text>" part.
+_TIMESTAMPED_LINE_RE = re.compile(_TIMESTAMP + " - ")
+
+
+def concatenate_rows(date: str, sender: str, text: str) -> str:
+    """Combine message information in a readable format ready to be used.
+
+    Args:
+        date: Timestamp of the message as it appears in the export.
+        sender: Name of the message author.
+        text: Body of the message.
+
+    Returns:
+        The message rendered as "<sender> on <date>: <text>" followed by a
+        blank line.
+    """
+    return f"{sender} on {date}: {text}\n\n"
+
+
+class WhatsAppChatLoader(BaseLoader):
+    """Loader that loads WhatsApp messages text file."""
+
+    def __init__(self, path: str):
+        """Initialize with path.
+
+        Args:
+            path: Location of the exported WhatsApp chat text file.
+        """
+        self.file_path = path
+
+    def load(self) -> List[Document]:
+        """Load the chat export as a single document.
+
+        Lines of the form "<timestamp> - <sender>: <text>" start a message.
+        Lines without a leading timestamp are treated as continuations of
+        the previous message, so multi-line messages are kept whole.
+        Timestamped lines that carry no "<sender>: <text>" part and blank
+        lines are skipped.
+
+        Returns:
+            A one-element list holding every message, with the file path
+            stored under the "source" metadata key.
+        """
+        p = Path(self.file_path)
+        # Each entry is [date, sender, text]; a list so text can be extended.
+        messages: List[List[str]] = []
+        in_message = False
+
+        with open(p, encoding="utf8") as f:
+            # Iterate lazily instead of reading the whole file up front.
+            for raw_line in f:
+                line = raw_line.strip()
+                if not line:
+                    continue
+                result = _MESSAGE_RE.match(line)
+                if result:
+                    messages.append(list(result.groups()))
+                    in_message = True
+                elif _TIMESTAMPED_LINE_RE.match(line):
+                    # Timestamped but not a user message: do not attach
+                    # following lines to the previous message.
+                    in_message = False
+                elif in_message:
+                    messages[-1][2] += "\n" + line
+
+        # Join once rather than growing a string inside the loop.
+        text_content = "".join(concatenate_rows(*m) for m in messages)
+        metadata = {"source": str(p)}
+
+        return [Document(page_content=text_content, metadata=metadata)]