You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/langchain/document_loaders/telegram.py

50 lines
1.5 KiB
Python

"""Loader that loads Telegram chat json dump."""
import json
from pathlib import Path
from typing import List
import pandas as pd
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
def concatenate_rows(row: dict) -> str:
"""Combine message information in a readable format ready to be used."""
date = row["date"]
sender = row["from"]
text = row["text"]
return f"{sender} on {date}: {text}\n\n"
class TelegramChatLoader(BaseLoader):
"""Loader that loads Telegram chat json directory dump."""
def __init__(self, path: str):
"""Initialize with path."""
self.file_path = path
def load(self) -> List[Document]:
"""Load documents."""
p = Path(self.file_path)
with open(p, encoding="utf8") as f:
d = json.load(f)
normalized_messages = pd.json_normalize(d["messages"])
df_normalized_messages = pd.DataFrame(normalized_messages)
# Only keep plain text messages (no services, links, hashtags, code, bold...)
df_filtered = df_normalized_messages[
(df_normalized_messages.type == "message")
& (df_normalized_messages.text.apply(lambda x: type(x) == str))
]
df_filtered = df_filtered[["date", "text", "from"]]
text = df_filtered.apply(concatenate_rows, axis=1).str.cat(sep="")
metadata = {"source": str(p)}
return [Document(page_content=text, metadata=metadata)]