From c60954d0f85fbb0971f54cbcb3497eb5fdd72baf Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 15 Feb 2023 23:24:32 -0800 Subject: [PATCH] Harrison/telegram loader (#1080) Co-authored-by: Maxime Vidal --- .../examples/directory_loader.ipynb | 2 +- .../examples/example_data/telegram.json | 31 +++++++ .../document_loaders/examples/telegram.ipynb | 84 +++++++++++++++++++ langchain/document_loaders/__init__.py | 6 +- langchain/document_loaders/telegram.py | 49 +++++++++++ 5 files changed, 169 insertions(+), 3 deletions(-) create mode 100644 docs/modules/document_loaders/examples/example_data/telegram.json create mode 100644 docs/modules/document_loaders/examples/telegram.ipynb create mode 100644 langchain/document_loaders/telegram.py diff --git a/docs/modules/document_loaders/examples/directory_loader.ipynb b/docs/modules/document_loaders/examples/directory_loader.ipynb index 0a268f3d..5ffe1a84 100644 --- a/docs/modules/document_loaders/examples/directory_loader.ipynb +++ b/docs/modules/document_loaders/examples/directory_loader.ipynb @@ -93,7 +93,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/modules/document_loaders/examples/example_data/telegram.json b/docs/modules/document_loaders/examples/example_data/telegram.json new file mode 100644 index 00000000..733cfcc1 --- /dev/null +++ b/docs/modules/document_loaders/examples/example_data/telegram.json @@ -0,0 +1,31 @@ +{ + "name": "Grace 🧤", + "type": "personal_chat", + "id": 2730825451, + "messages": [ + { + "id": 1980499, + "type": "message", + "date": "2020-01-01T00:00:02", + "from": "Henry", + "from_id": 4325636679, + "text": "It's 2020..." + }, + { + "id": 1980500, + "type": "message", + "date": "2020-01-01T00:00:04", + "from": "Henry", + "from_id": 4325636679, + "text": "Fireworks!" 
+ }, + { + "id": 1980501, + "type": "message", + "date": "2020-01-01T00:00:05", + "from": "Grace 🧤 🍒", + "from_id": 4720225552, + "text": "You're a minute late!" + } + ] +} \ No newline at end of file diff --git a/docs/modules/document_loaders/examples/telegram.ipynb b/docs/modules/document_loaders/examples/telegram.ipynb new file mode 100644 index 00000000..ca561645 --- /dev/null +++ b/docs/modules/document_loaders/examples/telegram.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "33205b12", + "metadata": {}, + "source": [ + "# Telegram\n", + "\n", + "This notebook covers how to load data from Telegram into a format that can be ingested into LangChain." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "90b69c94", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TelegramChatLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "13deb0f5", + "metadata": {}, + "outputs": [], + "source": [ + "loader = TelegramChatLoader(\"example_data/telegram.json\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "9ccc1e2f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content=\"Henry on 2020-01-01T00:00:02: It's 2020...\\n\\nHenry on 2020-01-01T00:00:04: Fireworks!\\n\\nGrace 🧤 ðŸ\\x8d’ on 2020-01-01T00:00:05: You're a minute late!\\n\\n\", lookup_str='', metadata={'source': 'example_data/telegram.json'}, lookup_index=0)]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e64cac2", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": 
".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index c9f25800..fd2eac62 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -6,7 +6,7 @@ from langchain.document_loaders.college_confidential import CollegeConfidentialL from langchain.document_loaders.directory import DirectoryLoader from langchain.document_loaders.docx import UnstructuredDocxLoader from langchain.document_loaders.email import UnstructuredEmailLoader -from langchain.document_loaders.everynote import EveryNoteLoader +from langchain.document_loaders.evernote import EverNoteLoader from langchain.document_loaders.gcs_directory import GCSDirectoryLoader from langchain.document_loaders.gcs_file import GCSFileLoader from langchain.document_loaders.googledrive import GoogleDriveLoader @@ -23,6 +23,7 @@ from langchain.document_loaders.readthedocs import ReadTheDocsLoader from langchain.document_loaders.roam import RoamLoader from langchain.document_loaders.s3_directory import S3DirectoryLoader from langchain.document_loaders.s3_file import S3FileLoader +from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.text import TextLoader from langchain.document_loaders.unstructured import UnstructuredFileLoader from langchain.document_loaders.url import UnstructuredURLLoader @@ -55,8 +56,9 @@ __all__ = [ "CollegeConfidentialLoader", "GutenbergLoader", "PagedPDFSplitter", - "EveryNoteLoader", + "EverNoteLoader", "AirbyteJSONLoader", "OnlinePDFLoader", "PDFMinerLoader", + "TelegramChatLoader", ] diff --git a/langchain/document_loaders/telegram.py b/langchain/document_loaders/telegram.py new file mode 100644 index 00000000..3a7ae860 --- /dev/null +++ b/langchain/document_loaders/telegram.py 
"""Loader that loads Telegram chat json dump."""
import json
from pathlib import Path
from typing import List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


def concatenate_rows(row: dict) -> str:
    """Combine message information in a readable format ready to be used.

    Args:
        row: Mapping holding the ``"date"``, ``"from"`` and ``"text"`` entries
            of a single message.

    Returns:
        The message rendered as ``"<from> on <date>: <text>"`` followed by a
        blank line.

    Raises:
        KeyError: If one of the three entries is missing from ``row``.
    """
    date = row["date"]
    sender = row["from"]
    text = row["text"]
    return f"{sender} on {date}: {text}\n\n"


class TelegramChatLoader(BaseLoader):
    """Loader that loads a Telegram chat json dump file.

    Only the standard library is used, so importing this module does not pull
    in any optional third-party package.
    """

    def __init__(self, path: str):
        """Initialize with the path of the exported json file."""
        self.file_path = path

    def load(self) -> List[Document]:
        """Load the chat as a single document.

        Entries whose ``type`` is not ``"message"`` and entries whose ``text``
        is not a string are skipped; the remaining ones are formatted with
        ``concatenate_rows`` and joined in file order.

        Returns:
            A one-element list with a Document whose content is the joined
            messages (an empty string when no message is kept) and whose
            metadata stores the source path.

        Raises:
            KeyError: If the json document has no ``"messages"`` entry.
        """
        p = Path(self.file_path)

        with open(p, encoding="utf8") as f:
            d = json.load(f)

        # Only keep plain text messages (no services, links, hashtags, code, bold...)
        # Each kept message is projected onto the three fields that are
        # rendered; a field absent from the export becomes None instead of
        # aborting the whole load.
        rows = (
            {field: message.get(field) for field in ("date", "text", "from")}
            for message in d["messages"]
            if message.get("type") == "message"
            and isinstance(message.get("text"), str)
        )

        # str.join copes with zero kept messages and simply yields "".
        text = "".join(concatenate_rows(row) for row in rows)

        metadata = {"source": str(p)}

        return [Document(page_content=text, metadata=metadata)]