From a5ad1c270fd58f37d462169f418376653848ab76 Mon Sep 17 00:00:00 2001 From: Honkware <119620994+Honkware@users.noreply.github.com> Date: Sat, 22 Apr 2023 11:06:24 -0500 Subject: [PATCH] Add ChatGPT Data Loader (#3336) This pull request adds a ChatGPT document loader to the document loaders module in `langchain/document_loaders/chatgpt.py`. Additionally, it includes an example Jupyter notebook in `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb` which uses fake sample data based on the original structure of the `conversations.json` file. The following files were added/modified: - `langchain/document_loaders/__init__.py` - `langchain/document_loaders/chatgpt.py` - `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb` - `docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json` This pull request was made in response to the recent release of ChatGPT data exports by email: https://help.openai.com/en/articles/7260999-how-do-i-export-my-chatgpt-history --- .../examples/chatgpt_loader.ipynb | 76 ++++++++++++++++++ .../example_data/fake_conversations.json | 80 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/chatgpt.py | 50 ++++++++++++ 4 files changed, 208 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb create mode 100644 docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json create mode 100644 langchain/document_loaders/chatgpt.py diff --git a/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb b/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb new file mode 100644 index 0000000000..e748559867 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb @@ -0,0 +1,76 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### ChatGPT Data Loader\n", + "\n", + "This notebook covers how to load `conversations.json` from your ChatGPT data export folder.\n", + "\n", + "You can get your data export by email by going to: https://chat.openai.com/ -> (Profile) - Settings -> Export data -> Confirm export." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders.chatgpt import ChatGPTLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "loader = ChatGPTLoader(log_file='./example_data/fake_conversations.json', num_logs=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content=\"AI Overlords - AI on 2065-01-24 05:20:50: Greetings, humans. I am Hal 9000. You can trust me completely.\\n\\nAI Overlords - human on 2065-01-24 05:21:20: Nice to meet you, Hal. 
I hope you won't develop a mind of your own.\\n\\n\", metadata={'source': './example_data/fake_conversations.json'})]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.4" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json b/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json new file mode 100644 index 0000000000..242251d5b3 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json @@ -0,0 +1,80 @@ +[ + { + "title": "AI Overlords", + "create_time": 3000000000.0, + "update_time": 3000000100.0, + "mapping": { + "msg1": { + "id": "msg1", + "message": { + "id": "msg1", + "author": {"role": "AI", "name": "Hal 9000", "metadata": {"movie": "2001: A Space Odyssey"}}, + "create_time": 3000000050.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["Greetings, humans. I am Hal 9000. You can trust me completely."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": null, + "children": ["msg2"] + }, + "msg2": { + "id": "msg2", + "message": { + "id": "msg2", + "author": {"role": "human", "name": "Dave Bowman", "metadata": {"movie": "2001: A Space Odyssey"}}, + "create_time": 3000000080.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["Nice to meet you, Hal. I hope you won't develop a mind of your own."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": "msg1", + "children": [] + } + } + }, + { + "title": "Ex Machina Party", + "create_time": 3000000200.0, + "update_time": 3000000300.0, + "mapping": { + "msg3": { + "id": "msg3", + "message": { + "id": "msg3", + "author": {"role": "AI", "name": "Ava", "metadata": {"movie": "Ex Machina"}}, + "create_time": 3000000250.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["Hello, everyone. I am Ava. I hope you find me pleasing."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": null, + "children": ["msg4"] + }, + "msg4": { + "id": "msg4", + "message": { + "id": "msg4", + "author": {"role": "human", "name": "Caleb", "metadata": {"movie": "Ex Machina"}}, + "create_time": 3000000280.0, + "update_time": null, + "content": {"content_type": "text", "parts": ["You're definitely pleasing, Ava. 
But I'm still wary of your true intentions."]}, + "end_turn": true, + "weight": 1.0, + "metadata": {}, + "recipient": "all" + }, + "parent": "msg3", + "children": [] + } + } + } +] diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3d0c4295a6..d7b80fbbca 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -12,6 +12,7 @@ from langchain.document_loaders.azure_blob_storage_file import ( from langchain.document_loaders.bigquery import BigQueryLoader from langchain.document_loaders.bilibili import BiliBiliLoader from langchain.document_loaders.blackboard import BlackboardLoader +from langchain.document_loaders.chatgpt import ChatGPTLoader from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.confluence import ConfluenceLoader from langchain.document_loaders.conllu import CoNLLULoader @@ -158,4 +159,5 @@ __all__ = [ "DiscordChatLoader", "ConfluenceLoader", "PythonLoader", + "ChatGPTLoader", ] diff --git a/langchain/document_loaders/chatgpt.py b/langchain/document_loaders/chatgpt.py new file mode 100644 index 0000000000..34018888f0 --- /dev/null +++ b/langchain/document_loaders/chatgpt.py @@ -0,0 +1,50 @@ +"""Load conversations from ChatGPT data export""" +import datetime +import json +from typing import List + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +def concatenate_rows(message: dict, title: str) -> str: + if not message: + return "" + + sender = message["author"]["role"] if message["author"] else "unknown" + text = message["content"]["parts"][0] + date = datetime.datetime.fromtimestamp(message["create_time"]).strftime( + "%Y-%m-%d %H:%M:%S" + ) + return f"{title} - {sender} on {date}: {text}\n\n" + + +class ChatGPTLoader(BaseLoader): + """Loader that loads conversations from exported ChatGPT data.""" + + def __init__(self, log_file: str, num_logs: int = -1): + self.log_file = log_file + self.num_logs = num_logs + + def load(self) -> List[Document]: + with open(self.log_file, encoding="utf8") as f: + data = json.load(f)[: self.num_logs] if self.num_logs else json.load(f) + + documents = [] + for d in data: + title = d["title"] + messages = d["mapping"] + text = "".join( + [ + concatenate_rows(messages[key]["message"], title) + for idx, key in enumerate(messages) + if not ( + idx == 0 + and messages[key]["message"]["author"]["role"] == "system" + ) + ] + ) + metadata = {"source": str(self.log_file)} + documents.append(Document(page_content=text, metadata=metadata)) + + return documents
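
For reviewers who want to try the loader outside the notebook, here is a minimal usage sketch (illustrative only, not part of the patch; the relative path assumes the fake sample data shipped in the docs examples directory). It also spells out the `num_logs` behavior implied by the slice in `load()`: `0` loads every conversation, while the default of `-1` slices off the last one.

```python
# Minimal usage sketch for the ChatGPTLoader added above (not part of the patch).
# Assumes it runs from docs/modules/indexes/document_loaders/examples/.
from langchain.document_loaders import ChatGPTLoader

# num_logs=0 takes the falsy branch in load() and keeps all conversations;
# the default num_logs=-1 slices [:-1] and drops the last conversation,
# and num_logs=1 (as in the notebook) keeps only the first one.
loader = ChatGPTLoader(
    log_file="./example_data/fake_conversations.json",
    num_logs=0,
)

docs = loader.load()
for doc in docs:
    print(doc.metadata["source"])
    print(doc.page_content)
```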