diff --git a/docs/extras/integrations/chat_loaders/wechat.ipynb b/docs/extras/integrations/chat_loaders/wechat.ipynb new file mode 100644 index 0000000000..d5d0634572 --- /dev/null +++ b/docs/extras/integrations/chat_loaders/wechat.ipynb @@ -0,0 +1,300 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "c4ff9336-1cf3-459e-bd70-d1314c1da6a0", + "metadata": {}, + "source": [ + "# WeChat\n", + "\n", + "There is not yet a straightforward way to export personal WeChat messages. However, if you need no more than a few hundred messages for model fine-tuning or few-shot examples, this notebook shows how to create your own chat loader that converts copy-pasted WeChat messages into a list of LangChain messages.\n", + "\n", + "> Highly inspired by https://python.langchain.com/docs/integrations/chat_loaders/discord\n", + "\n", + "\n", + "The process has five steps:\n", + "1. Open your chat in the WeChat desktop app. Select the messages you need by dragging with the mouse or by right-clicking. Due to restrictions, you can select up to 100 messages at a time. Press `CMD`/`Ctrl` + `C` to copy.\n", + "2. Create the chat .txt file by pasting the selected messages into a file on your local computer.\n", + "3. Copy the chat loader definition from below to a local file.\n", + "4. Initialize the `WeChatChatLoader` with the path to the text file.\n", + "5. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n", + "\n", + "## 1. Create message dump\n", + "\n", + "This loader only supports .txt files in the format generated by copying messages in the app to your clipboard and pasting them into a file. Below is an example."
import logging
import re
from typing import Iterator, List

from langchain import schema
from langchain.chat_loaders import base as chat_loaders

logger = logging.getLogger()


class WeChatChatLoader(chat_loaders.BaseChatLoader):
    """Load a copy-pasted WeChat conversation from a text file.

    The file is expected to consist of header lines of the form
    ``<sender> YYYY/MM/DD H:MM AM|PM``, each followed by the lines of that
    message, for example::

        女朋友 2023/09/16 2:51 PM
        天气有点凉

    Every text message becomes a ``HumanMessage`` whose ``additional_kwargs``
    carry the sender name and the timestamp string taken from its header.
    """

    # A line that is nothing but "[...]". Non-text messages show up like this
    # in the pasted text (e.g. "[动画表情]").
    _PLACEHOLDER_REGEX = re.compile(r"\[.*\]")

    def __init__(self, path: str):
        """Initialize the WeChat chat loader.

        Args:
            path: Path to the text file holding the pasted WeChat messages.
        """
        self.path = path
        # Header line: "<sender> <date> <time> <AM|PM>". The sender group is
        # lazy so it ends at the space right before the date.
        self._message_line_regex = re.compile(
            r"(?P<sender>.+?) "
            r"(?P<timestamp>\d{4}/\d{2}/\d{2} \d{1,2}:\d{2} (?:AM|PM))"
        )

    def _append_message_to_results(
        self,
        results: List,
        current_sender: str,
        current_timestamp: str,
        current_content: List[str],
    ) -> List:
        """Append one parsed message to ``results`` unless it must be skipped.

        Args:
            results: Messages collected so far; extended in place.
            current_sender: Sender name taken from the header line.
            current_timestamp: Timestamp string taken from the header line.
            current_content: Stripped text lines that followed the header.

        Returns:
            The same ``results`` list, so call sites may reassign it.
        """
        content = "\n".join(current_content).strip()
        text_lines = [ln for ln in content.split("\n") if ln]

        # Only blank lines followed the header: there is no message to keep.
        if not text_lines:
            return results

        # Skip non-text messages like stickers, images, etc. Every line has to
        # be a placeholder in full, so text that merely starts with a
        # bracketed tag (e.g. "[note] see you at 5") is kept.
        if all(self._PLACEHOLDER_REGEX.fullmatch(ln) for ln in text_lines):
            return results

        results.append(
            schema.HumanMessage(
                content=content,
                additional_kwargs={
                    "sender": current_sender,
                    "events": [{"message_time": current_timestamp}],
                },
            )
        )
        return results

    def _load_single_chat_session_from_txt(
        self, file_path: str
    ) -> chat_loaders.ChatSession:
        """Load a single chat session from a text file.

        Lines that appear before the first header line are ignored because
        they cannot be attributed to a sender.

        Args:
            file_path: Path to the text file containing the chat messages.

        Returns:
            A `ChatSession` object containing the loaded chat messages.
        """
        results: List[schema.BaseMessage] = []
        current_sender = None
        current_timestamp = None
        current_content: List[str] = []

        # "utf-8-sig" reads plain UTF-8 as well and drops a leading BOM, which
        # would otherwise become part of the first sender name.
        with open(file_path, "r", encoding="utf-8-sig") as file:
            for line in file:
                header = self._message_line_regex.match(line)
                if header is None:
                    current_content.append(line.strip())
                    continue
                # A new header closes the message collected so far.
                if current_sender and current_content:
                    self._append_message_to_results(
                        results, current_sender, current_timestamp, current_content
                    )
                current_sender, current_timestamp = header.groups()
                current_content = []

        # Flush the last message; no header follows it.
        if current_sender and current_content:
            self._append_message_to_results(
                results, current_sender, current_timestamp, current_content
            )

        return chat_loaders.ChatSession(messages=results)

    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
        """Lazily load the chat file as a single session.

        Yields:
            A `ChatSession` object containing the loaded chat messages.
        """
        yield self._load_single_chat_session_from_txt(self.path)
from typing import List

from langchain.chat_loaders.base import ChatSession
from langchain.chat_loaders.utils import map_ai_messages, merge_chat_runs
from langchain.chat_models import ChatOpenAI

# `loader` is the WeChatChatLoader instance created in the previous step.
raw_messages = loader.lazy_load()

# Collapse back-to-back messages from one sender into a single message.
merged_messages = merge_chat_runs(raw_messages)

# Treat everything sent by "男朋友" as the AI side of the conversation.
messages: List[ChatSession] = [
    *map_ai_messages(merged_messages, sender="男朋友")
]

# Inspect the converted sessions.
messages

# Ask the chat model to continue the first session, printing tokens as they
# arrive.
llm = ChatOpenAI()
conversation = messages[0]['messages']
for chunk in llm.stream(conversation):
    print(chunk.content, end="", flush=True)