{
"cells": [
{
"cell_type": "markdown",
"id": "c4ff9336-1cf3-459e-bd70-d1314c1da6a0",
"metadata": {},
"source": [
"# Discord\n",
"\n",
"This notebook shows how to create your own chat loader that converts copy-pasted messages (from DMs) into a list of LangChain messages.\n",
"\n",
"The process has four steps:\n",
"1. Create the chat .txt file by copying chats from the Discord app and pasting them into a file on your local computer.\n",
"2. Copy the chat loader definition from below to a local file.\n",
"3. Initialize the `DiscordChatLoader` with the file path pointing to the text file.\n",
"4. Call `loader.load()` (or `loader.lazy_load()`) to perform the conversion.\n",
"\n",
"## 1. Create message dump\n",
"\n",
"Currently (2023/08/23) this loader only supports .txt files in the format generated by copying messages in the app to your clipboard and pasting them into a file. Below is an example."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "e4ccfdfa-6869-4d67-90a0-ab99f01b7553",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Overwriting discord_chats.txt\n"
]
}
],
"source": [
"%%writefile discord_chats.txt\n",
"talkingtower — 08/15/2023 11:10 AM\n",
"Love music! Do you like jazz?\n",
"reporterbob — 08/15/2023 9:27 PM\n",
"Yes! Jazz is fantastic. Ever heard this one?\n",
"Website\n",
"Listen to classic jazz track...\n",
"\n",
"talkingtower — Yesterday at 5:03 AM\n",
"Indeed! Great choice. 🎷\n",
"reporterbob — Yesterday at 5:23 AM\n",
"Thanks! How about some virtual sightseeing?\n",
"Website\n",
"Virtual tour of famous landmarks...\n",
"\n",
"talkingtower — Today at 2:38 PM\n",
"Sounds fun! Let's explore.\n",
"reporterbob — Today at 2:56 PM\n",
"Enjoy the tour! See you around.\n",
"talkingtower — Today at 3:00 PM\n",
"Thank you! Goodbye! 👋\n",
"reporterbob — Today at 3:02 PM\n",
"Farewell! Happy exploring."
]
},
{
"cell_type": "markdown",
"id": "359565a7-dad3-403c-a73c-6414b1295127",
"metadata": {},
"source": [
"## 2. Define chat loader\n",
"\n",
"LangChain currently does not support a Discord chat loader out of the box, so we will define our own below."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a429e0c4-4d7d-45f8-bbbb-c7fc5229f6af",
"metadata": {},
"outputs": [],
"source": [
"import logging\n",
"import re\n",
"from typing import Iterator, List\n",
"\n",
"from langchain import schema\n",
"from langchain.chat_loaders import base as chat_loaders\n",
"\n",
"logger = logging.getLogger()\n",
"\n",
"\n",
"class DiscordChatLoader(chat_loaders.BaseChatLoader):\n",
"    \n",
"    def __init__(self, path: str):\n",
"        \"\"\"\n",
"        Initialize the Discord chat loader.\n",
"\n",
"        Args:\n",
"            path: Path to the exported Discord chat text file.\n",
"        \"\"\"\n",
"        self.path = path\n",
"        # Matches message header lines of the form \"sender — timestamp\"\n",
"        self._message_line_regex = re.compile(\n",
"            r\"(.+?) — (\\w{3,9} \\d{1,2}(?:st|nd|rd|th)?(?:, \\d{4})? \\d{1,2}:\\d{2} (?:AM|PM)|Today at \\d{1,2}:\\d{2} (?:AM|PM)|Yesterday at \\d{1,2}:\\d{2} (?:AM|PM))\", # noqa\n",
"            flags=re.DOTALL,\n",
"        )\n",
"\n",
"    def _load_single_chat_session_from_txt(\n",
"        self, file_path: str\n",
"    ) -> chat_loaders.ChatSession:\n",
"        \"\"\"\n",
"        Load a single chat session from a text file.\n",
"\n",
"        Args:\n",
"            file_path: Path to the text file containing the chat messages.\n",
"\n",
"        Returns:\n",
"            A `ChatSession` object containing the loaded chat messages.\n",
"        \"\"\"\n",
"        with open(file_path, \"r\", encoding=\"utf-8\") as file:\n",
"            lines = file.readlines()\n",
"\n",
"        results: List[schema.BaseMessage] = []\n",
"        current_sender = None\n",
"        current_timestamp = None\n",
"        current_content = []\n",
"        for line in lines:\n",
"            # A \"sender — timestamp\" header line starts a new message\n",
"            if re.match(\n",
"                r\".+? — (\\d{2}/\\d{2}/\\d{4} \\d{1,2}:\\d{2} (?:AM|PM)|Today at \\d{1,2}:\\d{2} (?:AM|PM)|Yesterday at \\d{1,2}:\\d{2} (?:AM|PM))\", # noqa\n",
"                line,\n",
"            ):\n",
"                if current_sender and current_content:\n",
"                    results.append(\n",
"                        schema.HumanMessage(\n",
"                            content=\"\".join(current_content).strip(),\n",
"                            additional_kwargs={\n",
"                                \"sender\": current_sender,\n",
"                                \"events\": [{\"message_time\": current_timestamp}],\n",
"                            },\n",
"                        )\n",
"                    )\n",
"                current_sender, current_timestamp = line.split(\" — \")[:2]\n",
"                current_content = [\n",
"                    line[len(current_sender) + len(current_timestamp) + 4 :].strip()\n",
"                ]\n",
"            # A bare \"[H:MM AM/PM]\" line is a follow-up message from the same sender\n",
"            elif re.match(r\"\\[\\d{1,2}:\\d{2} (?:AM|PM)\\]\", line.strip()):\n",
"                results.append(\n",
"                    schema.HumanMessage(\n",
"                        content=\"\".join(current_content).strip(),\n",
"                        additional_kwargs={\n",
"                            \"sender\": current_sender,\n",
"                            \"events\": [{\"message_time\": current_timestamp}],\n",
"                        },\n",
"                    )\n",
"                )\n",
"                current_timestamp = line.strip()[1:-1]\n",
"                current_content = []\n",
"            # Any other line continues the current message\n",
"            else:\n",
"                current_content.append(\"\\n\" + line.strip())\n",
"\n",
"        if current_sender and current_content:\n",
"            results.append(\n",
"                schema.HumanMessage(\n",
"                    content=\"\".join(current_content).strip(),\n",
"                    additional_kwargs={\n",
"                        \"sender\": current_sender,\n",
"                        \"events\": [{\"message_time\": current_timestamp}],\n",
"                    },\n",
"                )\n",
"            )\n",
"\n",
"        return chat_loaders.ChatSession(messages=results)\n",
"\n",
"    def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:\n",
"        \"\"\"\n",
"        Lazy load the messages from the chat file and yield them in the required format.\n",
"\n",
"        Yields:\n",
"            A `ChatSession` object containing the loaded chat messages.\n",
"        \"\"\"\n",
"        yield self._load_single_chat_session_from_txt(self.path)\n"
]
},
{
"cell_type": "markdown",
"id": "c8240393-48be-44d2-b0d6-52c215cd8ac2",
"metadata": {},
"source": [
"## 3. Create loader\n",
"\n",
"We will point to the file we just wrote to disk."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "1268de40-b0e5-445d-9cd8-54856cd0293a",
"metadata": {},
"outputs": [],
"source": [
"loader = DiscordChatLoader(\n",
"    path=\"./discord_chats.txt\",\n",
")"
]
},
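{
"cell_type": "markdown",
"id": "load-eagerly-note",
"metadata": {},
"source": [
"As mentioned in step 4, you can also call `loader.load()` to materialize the sessions eagerly instead of iterating with `lazy_load()`. Below is a minimal sketch (output omitted)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "load-eagerly-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Eagerly load every chat session as a list (this loader yields a single session)\n",
"chat_sessions = loader.load()\n",
"print(len(chat_sessions), len(chat_sessions[0][\"messages\"]))"
]
},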
{
"cell_type": "markdown",
"id": "4928df4b-ae31-48a7-bd76-be3ecee1f3e0",
"metadata": {},
"source": [
"## 4. Load Messages\n",
"\n",
"Assuming the format is correct, the loader will convert the chats to LangChain messages."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "c8a0836d-4a22-4790-bfe9-97f2145bb0d6",
"metadata": {},
"outputs": [],
"source": [
"from typing import List\n",
"from langchain.chat_loaders.base import ChatSession\n",
"from langchain.chat_loaders.utils import (\n",
"    map_ai_messages,\n",
"    merge_chat_runs,\n",
")\n",
"\n",
"raw_messages = loader.lazy_load()\n",
"# Merge consecutive messages from the same sender into a single message\n",
"merged_messages = merge_chat_runs(raw_messages)\n",
"# Convert messages from \"talkingtower\" to AI messages\n",
"messages: List[ChatSession] = list(map_ai_messages(merged_messages, sender=\"talkingtower\"))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1913963b-c44e-4f7a-aba7-0423c9b8bd59",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'messages': [AIMessage(content='Love music! Do you like jazz?', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': '08/15/2023 11:10 AM\\n'}]}, example=False),\n",
" HumanMessage(content='Yes! Jazz is fantastic. Ever heard this one?\\nWebsite\\nListen to classic jazz track...', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': '08/15/2023 9:27 PM\\n'}]}, example=False),\n",
" AIMessage(content='Indeed! Great choice. 🎷', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Yesterday at 5:03 AM\\n'}]}, example=False),\n",
" HumanMessage(content='Thanks! How about some virtual sightseeing?\\nWebsite\\nVirtual tour of famous landmarks...', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Yesterday at 5:23 AM\\n'}]}, example=False),\n",
" AIMessage(content=\"Sounds fun! Let's explore.\", additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Today at 2:38 PM\\n'}]}, example=False),\n",
" HumanMessage(content='Enjoy the tour! See you around.', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Today at 2:56 PM\\n'}]}, example=False),\n",
" AIMessage(content='Thank you! Goodbye! 👋', additional_kwargs={'sender': 'talkingtower', 'events': [{'message_time': 'Today at 3:00 PM\\n'}]}, example=False),\n",
" HumanMessage(content='Farewell! Happy exploring.', additional_kwargs={'sender': 'reporterbob', 'events': [{'message_time': 'Today at 3:02 PM\\n'}]}, example=False)]}]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"messages"
]
},
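{
"cell_type": "markdown",
"id": "inspect-senders-note",
"metadata": {},
"source": [
"Each loaded `ChatSession` is a dict with a `messages` list, and the original Discord username is kept in each message's `additional_kwargs`. The small snippet below is illustrative only, showing one way to inspect who said what."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "inspect-senders-sketch",
"metadata": {},
"outputs": [],
"source": [
"# Inspect the mapped roles and original Discord senders (illustrative snippet)\n",
"for message in messages[0][\"messages\"]:\n",
"    sender = message.additional_kwargs[\"sender\"]\n",
"    print(f\"{message.type:>5} | {sender:>12} | {message.content.splitlines()[0]}\")"
]
},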
{
"cell_type": "markdown",
"id": "8595a518-5c89-44aa-94a7-ca51e7e2a5fa",
"metadata": {},
"source": [
"### Next Steps\n",
"\n",
"You can then use these messages however you see fit, such as fine-tuning a model, selecting few-shot examples, or directly making predictions for the next message, as shown below."
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "08ff0a1e-fca0-4da3-aacd-d7401f99d946",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Thank you! Have a wonderful day! 🌟"
]
}
],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"llm = ChatOpenAI()\n",
"\n",
"for chunk in llm.stream(messages[0]['messages']):\n",
"    print(chunk.content, end=\"\", flush=True)"
]
},
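{
"cell_type": "markdown",
"id": "finetune-jsonl-note",
"metadata": {},
"source": [
"For the fine-tuning route mentioned above, one option is to export the conversation as chat-format JSONL records, each ending on one of the AI-mapped messages. The sketch below is only illustrative: the `discord_finetune.jsonl` filename and the exact record layout are assumptions, not something the chat loader provides."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "finetune-jsonl-sketch",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Rough sketch: build chat-format fine-tuning records from the loaded session.\n",
"# Each record ends on an AI-mapped (\"assistant\") message; earlier turns are context.\n",
"role_by_type = {\"ai\": \"assistant\", \"human\": \"user\"}\n",
"\n",
"chat = messages[0][\"messages\"]\n",
"records = []\n",
"for i, message in enumerate(chat):\n",
"    if message.type == \"ai\":\n",
"        records.append(\n",
"            {\n",
"                \"messages\": [\n",
"                    {\"role\": role_by_type[m.type], \"content\": m.content}\n",
"                    for m in chat[: i + 1]\n",
"                ]\n",
"            }\n",
"        )\n",
"\n",
"# The output filename is illustrative\n",
"with open(\"discord_finetune.jsonl\", \"w\", encoding=\"utf-8\") as f:\n",
"    for record in records:\n",
"        f.write(json.dumps(record, ensure_ascii=False) + \"\\n\")\n",
"\n",
"print(f\"Wrote {len(records)} training records\")"
]
},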
{
"cell_type": "code",
"execution_count": null,
"id": "50a5251f-074a-4a3c-a2b0-b1de85e0ac6a",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}