mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Add ChatGPT Data Loader (#3336)
This pull request adds a ChatGPT document loader to the document loaders module in `langchain/document_loaders/chatgpt.py`. It also includes an example Jupyter notebook in `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb`, which uses fake sample data that follows the structure of the original `conversations.json` file.

The following files were added or modified:

- `langchain/document_loaders/__init__.py`
- `langchain/document_loaders/chatgpt.py`
- `docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb`
- `docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json`

This pull request was made in response to the recent release of ChatGPT data exports by email: https://help.openai.com/en/articles/7260999-how-do-i-export-my-chatgpt-history
This commit is contained in:
parent
61d40ba042
commit
a5ad1c270f
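
For context, here is a minimal usage sketch of the new loader, mirroring the example notebook added in this PR (the path points at the fake sample data shipped alongside the notebook; a real export would point at the `conversations.json` file from the ChatGPT export zip):

```python
from langchain.document_loaders.chatgpt import ChatGPTLoader

# Point the loader at a conversations.json file from a ChatGPT data export.
# num_logs limits how many conversations are loaded (here, only the first one).
loader = ChatGPTLoader(
    log_file="./example_data/fake_conversations.json", num_logs=1
)

docs = loader.load()  # one Document per conversation
print(docs[0].page_content)
```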
76
docs/modules/indexes/document_loaders/examples/chatgpt_loader.ipynb
Normal file
@ -0,0 +1,76 @@
{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### ChatGPT Data Loader\n",
    "\n",
    "This notebook covers how to load `conversations.json` from your ChatGPT data export folder.\n",
    "\n",
    "You can get your data export by email by going to: https://chat.openai.com/ -> (Profile) - Settings -> Export data -> Confirm export."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain.document_loaders.chatgpt import ChatGPTLoader"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = ChatGPTLoader(log_file='./example_data/fake_conversations.json', num_logs=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[Document(page_content=\"AI Overlords - AI on 2065-01-24 05:20:50: Greetings, humans. I am Hal 9000. You can trust me completely.\\n\\nAI Overlords - human on 2065-01-24 05:21:20: Nice to meet you, Hal. I hope you won't develop a mind of your own.\\n\\n\", metadata={'source': './example_data/fake_conversations.json'})]"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loader.load()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
80
docs/modules/indexes/document_loaders/examples/example_data/fake_conversations.json
Normal file
@ -0,0 +1,80 @@
[
  {
    "title": "AI Overlords",
    "create_time": 3000000000.0,
    "update_time": 3000000100.0,
    "mapping": {
      "msg1": {
        "id": "msg1",
        "message": {
          "id": "msg1",
          "author": {"role": "AI", "name": "Hal 9000", "metadata": {"movie": "2001: A Space Odyssey"}},
          "create_time": 3000000050.0,
          "update_time": null,
          "content": {"content_type": "text", "parts": ["Greetings, humans. I am Hal 9000. You can trust me completely."]},
          "end_turn": true,
          "weight": 1.0,
          "metadata": {},
          "recipient": "all"
        },
        "parent": null,
        "children": ["msg2"]
      },
      "msg2": {
        "id": "msg2",
        "message": {
          "id": "msg2",
          "author": {"role": "human", "name": "Dave Bowman", "metadata": {"movie": "2001: A Space Odyssey"}},
          "create_time": 3000000080.0,
          "update_time": null,
          "content": {"content_type": "text", "parts": ["Nice to meet you, Hal. I hope you won't develop a mind of your own."]},
          "end_turn": true,
          "weight": 1.0,
          "metadata": {},
          "recipient": "all"
        },
        "parent": "msg1",
        "children": []
      }
    }
  },
  {
    "title": "Ex Machina Party",
    "create_time": 3000000200.0,
    "update_time": 3000000300.0,
    "mapping": {
      "msg3": {
        "id": "msg3",
        "message": {
          "id": "msg3",
          "author": {"role": "AI", "name": "Ava", "metadata": {"movie": "Ex Machina"}},
          "create_time": 3000000250.0,
          "update_time": null,
          "content": {"content_type": "text", "parts": ["Hello, everyone. I am Ava. I hope you find me pleasing."]},
          "end_turn": true,
          "weight": 1.0,
          "metadata": {},
          "recipient": "all"
        },
        "parent": null,
        "children": ["msg4"]
      },
      "msg4": {
        "id": "msg4",
        "message": {
          "id": "msg4",
          "author": {"role": "human", "name": "Caleb", "metadata": {"movie": "Ex Machina"}},
          "create_time": 3000000280.0,
          "update_time": null,
          "content": {"content_type": "text", "parts": ["You're definitely pleasing, Ava. But I'm still wary of your true intentions."]},
          "end_turn": true,
          "weight": 1.0,
          "metadata": {},
          "recipient": "all"
        },
        "parent": "msg3",
        "children": []
      }
    }
  }
]
langchain/document_loaders/__init__.py
@ -12,6 +12,7 @@ from langchain.document_loaders.azure_blob_storage_file import (
from langchain.document_loaders.bigquery import BigQueryLoader
from langchain.document_loaders.bilibili import BiliBiliLoader
from langchain.document_loaders.blackboard import BlackboardLoader
from langchain.document_loaders.chatgpt import ChatGPTLoader
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
from langchain.document_loaders.confluence import ConfluenceLoader
from langchain.document_loaders.conllu import CoNLLULoader
@ -158,4 +159,5 @@ __all__ = [
    "DiscordChatLoader",
    "ConfluenceLoader",
    "PythonLoader",
    "ChatGPTLoader",
]
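
With the loader registered in `langchain/document_loaders/__init__.py` and exported via `__all__`, it should also be importable directly from the package namespace, e.g.:

```python
from langchain.document_loaders import ChatGPTLoader
```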
50
langchain/document_loaders/chatgpt.py
Normal file
@ -0,0 +1,50 @@
"""Load conversations from ChatGPT data export."""
import datetime
import json
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


def concatenate_rows(message: dict, title: str) -> str:
    """Format a single exported message as a dated, human-readable line."""
    if not message:
        return ""

    sender = message["author"]["role"] if message["author"] else "unknown"
    text = message["content"]["parts"][0]
    date = datetime.datetime.fromtimestamp(message["create_time"]).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    return f"{title} - {sender} on {date}: {text}\n\n"


class ChatGPTLoader(BaseLoader):
    """Loader that loads conversations from exported ChatGPT data."""

    def __init__(self, log_file: str, num_logs: int = -1):
        """Initialize with the path to conversations.json; a positive num_logs
        keeps only the first num_logs conversations, otherwise all are loaded."""
        self.log_file = log_file
        self.num_logs = num_logs

    def load(self) -> List[Document]:
        # Read the export once; slice only when a positive limit was requested.
        with open(self.log_file, encoding="utf8") as f:
            data = json.load(f)
        if self.num_logs > 0:
            data = data[: self.num_logs]

        documents = []
        for d in data:
            title = d["title"]
            messages = d["mapping"]
            # Concatenate every message in the conversation, skipping a leading
            # system message if one is present.
            text = "".join(
                [
                    concatenate_rows(messages[key]["message"], title)
                    for idx, key in enumerate(messages)
                    if not (
                        idx == 0
                        and messages[key]["message"]
                        and messages[key]["message"]["author"]["role"] == "system"
                    )
                ]
            )
            metadata = {"source": str(self.log_file)}
            documents.append(Document(page_content=text, metadata=metadata))

        return documents
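
To illustrate the formatting done by `concatenate_rows`, here is a small sketch using the first message from the fake export above (the timestamp is rendered in local time, so the exact date string may differ from the notebook output):

```python
from langchain.document_loaders.chatgpt import concatenate_rows

# A single message in the shape used by conversations.json.
message = {
    "author": {"role": "AI", "name": "Hal 9000", "metadata": {}},
    "create_time": 3000000050.0,
    "content": {
        "content_type": "text",
        "parts": ["Greetings, humans. I am Hal 9000. You can trust me completely."],
    },
}

print(concatenate_rows(message, "AI Overlords"))
# e.g. "AI Overlords - AI on 2065-01-24 05:20:50: Greetings, humans. ..."
```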