Add ChatGPT Data Loader (#3336)

This pull request adds a ChatGPT document loader to the document loaders module in `langchain/document_loaders/chatgpt.py`. It also includes an example Jupyter notebook, `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb`, which uses fake sample data that follows the structure of a real `conversations.json` file. The following files were added or modified: `langchain/document_loaders/__init__.py`, `langchain/document_loaders/chatgpt.py`, `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb`, and `docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json`. This pull request was made in response to the recent release of ChatGPT data exports by email: https://help.openai.com/en/articles/7260999-how-do-i-export-my-chatgpt-history
2024-11-06 03:20:49 +00:00 · 2023-04-22 11:06:24 -05:00 · 2023-04-22 11:06:24 -05:00 · a5ad1c270f
commit a5ad1c270f
parent 61d40ba042
4 changed files with 208 additions and 0 deletions
--- a/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb
@ -0,0 +1,76 @@
 {
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ChatGPT Data Loader\n",
    "\n",
    "This notebook covers how to load `conversations.json` from your ChatGPT data export folder.\n",
    "\n",
    "You can get your data export by email by going to: https://chat.openai.com/ -> (Profile) - Settings -> Export data -> Confirm export."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders.chatgpt import ChatGPTLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = ChatGPTLoader(log_file='./example_data/fake_conversations.json', num_logs=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content=\"AI Overlords - AI on 2065-01-24 05:20:50: Greetings, humans. I am Hal 9000. You can trust me completely.\\n\\nAI Overlords - human on 2065-01-24 05:21:20: Nice to meet you, Hal. I hope you won't develop a mind of your own.\\n\\n\", metadata={'source': './example_data/fake_conversations.json'})]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loader.load()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
 }
--- a/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json
+++ b/docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json
@ -0,0 +1,80 @@
 [
    {
        "title": "AI Overlords",
        "create_time": 3000000000.0,
        "update_time": 3000000100.0,
        "mapping": {
            "msg1": {
                "id": "msg1",
                "message": {
                    "id": "msg1",
                    "author": {"role": "AI", "name": "Hal 9000", "metadata": {"movie": "2001: A Space Odyssey"}},
                    "create_time": 3000000050.0,
                    "update_time": null,
                    "content": {"content_type": "text", "parts": ["Greetings, humans. I am Hal 9000. You can trust me completely."]},
                    "end_turn": true,
                    "weight": 1.0,
                    "metadata": {},
                    "recipient": "all"
                },
                "parent": null,
                "children": ["msg2"]
            },
            "msg2": {
                "id": "msg2",
                "message": {
                    "id": "msg2",
                    "author": {"role": "human", "name": "Dave Bowman", "metadata": {"movie": "2001: A Space Odyssey"}},
                    "create_time": 3000000080.0,
                    "update_time": null,
                    "content": {"content_type": "text", "parts": ["Nice to meet you, Hal. I hope you won't develop a mind of your own."]},
                    "end_turn": true,
                    "weight": 1.0,
                    "metadata": {},
                    "recipient": "all"
                },
                "parent": "msg1",
                "children": []
            }
        }
    },
    {
        "title": "Ex Machina Party",
        "create_time": 3000000200.0,
        "update_time": 3000000300.0,
        "mapping": {
            "msg3": {
                "id": "msg3",
                "message": {
                    "id": "msg3",
                    "author": {"role": "AI", "name": "Ava", "metadata": {"movie": "Ex Machina"}},
                    "create_time": 3000000250.0,
                    "update_time": null,
                    "content": {"content_type": "text", "parts": ["Hello, everyone. I am Ava. I hope you find me pleasing."]},
                    "end_turn": true,
                    "weight": 1.0,
                    "metadata": {},
                    "recipient": "all"
                },
                "parent": null,
                "children": ["msg4"]
            },
            "msg4": {
                "id": "msg4",
                "message": {
                    "id": "msg4",
                    "author": {"role": "human", "name": "Caleb", "metadata": {"movie": "Ex Machina"}},
                    "create_time": 3000000280.0,
                    "update_time": null,
                    "content": {"content_type": "text", "parts": ["You're definitely pleasing, Ava. But I'm still wary of your true intentions."]},
                    "end_turn": true,
                    "weight": 1.0,
                    "metadata": {},
                    "recipient": "all"
                },
                "parent": "msg3",
                "children": []
            }
        }
    }
 ]
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@ -12,6 +12,7 @@ from langchain.document_loaders.azure_blob_storage_file import (
 from langchain.document_loaders.bigquery import BigQueryLoader
 from langchain.document_loaders.bilibili import BiliBiliLoader
 from langchain.document_loaders.blackboard import BlackboardLoader
 from langchain.document_loaders.chatgpt import ChatGPTLoader
 from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
 from langchain.document_loaders.confluence import ConfluenceLoader
 from langchain.document_loaders.conllu import CoNLLULoader
@ -158,4 +159,5 @@ __all__ = [
    "DiscordChatLoader",
    "ConfluenceLoader",
    "PythonLoader",
    "ChatGPTLoader",
 ]
--- a/langchain/document_loaders/chatgpt.py
+++ b/langchain/document_loaders/chatgpt.py
@ -0,0 +1,50 @@
 """Load conversations from ChatGPT data export"""
 import datetime
 import json
 from typing import List
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
 def concatenate_rows(message: dict, title: str) -> str:
    """Format one exported ChatGPT message as a single transcript entry.

    Args:
        message: A ``message`` object taken from a conversation's ``mapping``.
            May be ``None`` or empty for nodes that carry no message.
        title: Title of the conversation the message belongs to.

    Returns:
        A string of the form ``"<title> - <sender> on <date>: <text>"``
        followed by a blank line, or ``""`` when ``message`` is falsy.
        ``<sender>`` is ``"unknown"`` when the message has no author,
        ``<date>`` is ``"unknown time"`` when it has no ``create_time``, and
        ``<text>`` is empty when it has no content parts.
    """
    if not message:
        return ""

    author = message.get("author") or {}
    sender = author.get("role", "unknown")

    # A message may have no content parts at all; fall back to empty text
    # instead of raising IndexError on parts[0].
    parts = (message.get("content") or {}).get("parts") or []
    text = parts[0] if parts else ""

    # create_time can be null; fromtimestamp(None) would raise TypeError.
    created = message.get("create_time")
    if created is None:
        date = "unknown time"
    else:
        # Naive fromtimestamp() renders the timestamp in the local time zone.
        date = datetime.datetime.fromtimestamp(created).strftime(
            "%Y-%m-%d %H:%M:%S"
        )
    return f"{title} - {sender} on {date}: {text}\n\n"
 class ChatGPTLoader(BaseLoader):
    """Loader that loads conversations from exported ChatGPT data.

    Reads the JSON list of conversations from ``log_file`` and produces one
    ``Document`` per conversation, whose text is the concatenation of the
    formatted messages of that conversation.
    """

    def __init__(self, log_file: str, num_logs: int = -1):
        """Store the loader configuration.

        Args:
            log_file: Path to the exported conversations JSON file.
            num_logs: Maximum number of conversations to load, counted from
                the start of the file. Zero or a negative value (the default)
                loads every conversation.
        """
        self.log_file = log_file
        self.num_logs = num_logs

    def load(self) -> List[Document]:
        """Load the conversations and return one ``Document`` per conversation.

        Returns:
            A list of documents. Each has the conversation transcript as
            ``page_content`` and ``{"source": <log_file>}`` as metadata.
        """
        with open(self.log_file, encoding="utf8") as f:
            data = json.load(f)

        # Only a positive limit truncates. Slicing with the default -1 would
        # silently drop the last conversation instead of loading all of them.
        if self.num_logs and self.num_logs > 0:
            data = data[: self.num_logs]

        documents = []
        for conversation in data:
            title = conversation["title"]
            rows = []
            for idx, node in enumerate(conversation["mapping"].values()):
                message = node["message"]
                # Skip a leading system message. The first node may carry no
                # message at all, so guard the author lookup.
                author = (message or {}).get("author") or {}
                if idx == 0 and author.get("role") == "system":
                    continue
                rows.append(concatenate_rows(message, title))
            metadata = {"source": str(self.log_file)}
            documents.append(
                Document(page_content="".join(rows), metadata=metadata)
            )
        return documents