From a5ad1c270fd58f37d462169f418376653848ab76 Mon Sep 17 00:00:00 2001 From: Honkware <119620994+Honkware@users.noreply.github.com> Date: Sat, 22 Apr 2023 11:06:24 -0500 Subject: [PATCH] Add ChatGPT Data Loader (#3336) This pull request adds a ChatGPT document loader to the document loaders module in `langchain/document_loaders/chatgpt.py`. Additionally, it includes an example Jupyter notebook in `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb` which uses fake sample data based on the original structure of the `conversations.json` file. The following files were added/modified: - `langchain/document_loaders/__init__.py` - `langchain/document_loaders/chatgpt.py` - `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb` - `docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json` This pull request was made in response to the recent release of ChatGPT data exports by email: https://help.openai.com/en/articles/7260999-how-do-i-export-my-chatgpt-history --- .../examples/chatgpt_loader.ipynb | 76 ++++++++++++++++++ .../example_data/fake_conversations.json | 80 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/chatgpt.py | 50 ++++++++++++ 4 files changed, 208 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb create mode 100644 docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json create mode 100644 langchain/document_loaders/chatgpt.py diff --git a/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb b/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb new file mode 100644 index 0000000000..e748559867 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ChatGPT Data Loader\n", + "\n", + "This notebook covers how to load `conversations.json` from your ChatGPT data export folder.\n", + "\n", + "You can get your data export by email by going to: https://chat.openai.com/ -> (Profile) - Settings -> Export data -> Confirm export." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.chatgpt import ChatGPTLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = ChatGPTLoader(log_file='./example_data/fake_conversations.json', num_logs=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content=\"AI Overlords - AI on 2065-01-24 05:20:50: Greetings, humans. I am Hal 9000. You can trust me completely.\\n\\nAI Overlords - human on 2065-01-24 05:21:20: Nice to meet you, Hal. 
I hope you won't develop a mind of your own.\\n\\n\", metadata={'source': './example_data/fake_conversations.json'})]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json b/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json new file mode 100644 index 0000000000..242251d5b3 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json @@ -0,0 +1,80 @@ +[ + { + "title": "AI Overlords", + "create_time": 3000000000.0, + "update_time": 3000000100.0, + "mapping": { + "msg1": { + "id": "msg1", + "message": { + "id": "msg1", + "author": {"role": "AI", "name": "Hal 9000", "metadata": {"movie": "2001: A Space Odyssey"}}, + "create_time": 3000000050.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["Greetings, humans. I am Hal 9000. You can trust me completely."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": null, + "children": ["msg2"] + }, + "msg2": { + "id": "msg2", + "message": { + "id": "msg2", + "author": {"role": "human", "name": "Dave Bowman", "metadata": {"movie": "2001: A Space Odyssey"}}, + "create_time": 3000000080.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["Nice to meet you, Hal. I hope you won't develop a mind of your own."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": "msg1", + "children": [] + } + } + }, + { + "title": "Ex Machina Party", + "create_time": 3000000200.0, + "update_time": 3000000300.0, + "mapping": { + "msg3": { + "id": "msg3", + "message": { + "id": "msg3", + "author": {"role": "AI", "name": "Ava", "metadata": {"movie": "Ex Machina"}}, + "create_time": 3000000250.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["Hello, everyone. I am Ava. I hope you find me pleasing."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": null, + "children": ["msg4"] + }, + "msg4": { + "id": "msg4", + "message": { + "id": "msg4", + "author": {"role": "human", "name": "Caleb", "metadata": {"movie": "Ex Machina"}}, + "create_time": 3000000280.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["You're definitely pleasing, Ava. 
But I'm still wary of your true intentions."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": "msg3", + "children": [] + } + } + } +] diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3d0c4295a6..d7b80fbbca 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -12,6 +12,7 @@ from langchain.document_loaders.azure_blob_storage_file import ( from langchain.document_loaders.bigquery import BigQueryLoader from langchain.document_loaders.bilibili import BiliBiliLoader from langchain.document_loaders.blackboard import BlackboardLoader +from langchain.document_loaders.chatgpt import ChatGPTLoader from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.confluence import ConfluenceLoader from langchain.document_loaders.conllu import CoNLLULoader @@ -158,4 +159,5 @@ __all__ = [ "DiscordChatLoader", "ConfluenceLoader", "PythonLoader", + "ChatGPTLoader", ] diff --git a/langchain/document_loaders/chatgpt.py b/langchain/document_loaders/chatgpt.py new file mode 100644 index 0000000000..34018888f0 --- /dev/null +++ b/langchain/document_loaders/chatgpt.py @@ -0,0 +1,50 @@ +"""Load conversations from ChatGPT data export""" +import datetime +import json +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +def concatenate_rows(message: dict, title: str) -> str: + if not message: + return "" + + sender = message["author"]["role"] if message["author"] else "unknown" + text = message["content"]["parts"][0] + date = datetime.datetime.fromtimestamp(message["create_time"]).strftime( + "%Y-%m-%d %H:%M:%S" + ) + return f"{title} - {sender} on {date}: {text}\n\n" + + +class ChatGPTLoader(BaseLoader): + """Loader that loads conversations from exported ChatGPT data.""" + + def __init__(self, log_file: str, num_logs: int = -1): + self.log_file = log_file + self.num_logs = num_logs + + def load(self) -> List[Document]: + with open(self.log_file, encoding="utf8") as f: + data = json.load(f)[: self.num_logs] if self.num_logs else json.load(f) + + documents = [] + for d in data: + title = d["title"] + messages = d["mapping"] + text = "".join( + [ + concatenate_rows(messages[key]["message"], title) + for idx, key in enumerate(messages) + if not ( + idx == 0 + and messages[key]["message"]["author"]["role"] == "system" + ) + ] + ) + metadata = {"source": str(self.log_file)} + documents.append(Document(page_content=text, metadata=metadata)) + + return documents
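
For reviewers who want to try the loader outside the notebook, here is a minimal usage sketch (illustrative only, not part of the patch; the relative path assumes the fake sample data shipped in the docs examples directory). It also spells out the `num_logs` behavior implied by the slice in `load()`: `0` loads every conversation, while the default of `-1` slices off the last one.

```python
# Minimal usage sketch for the ChatGPTLoader added above (not part of the patch).
# Assumes it runs from docs/modules/indexes/document_loaders/examples/.
from langchain.document_loaders import ChatGPTLoader

# num_logs=0 takes the falsy branch in load() and keeps all conversations;
# the default num_logs=-1 slices [:-1] and drops the last conversation,
# and num_logs=1 (as in the notebook) keeps only the first one.
loader = ChatGPTLoader(
    log_file="./example_data/fake_conversations.json",
    num_logs=0,
)

docs = loader.load()
for doc in docs:
    print(doc.metadata["source"])
    print(doc.page_content)
```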