import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union

from langchain_core.chat_loaders import BaseChatLoader
from langchain_core.chat_sessions import ChatSession
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage

logger = logging.getLogger(__name__)

# File suffixes this loader knows how to parse.
_SUPPORTED_SUFFIXES = (".html", ".json")


class TelegramChatLoader(BaseChatLoader):
    """Load `telegram` conversations to LangChain chat messages.

    To export, use the Telegram Desktop app from
    https://desktop.telegram.org/, select a conversation, click the three dots
    in the top right corner, and select "Export chat history". Then select
    "Machine-readable JSON" (preferred) to export. Note: the 'lite' versions of
    the desktop app (like "Telegram for MacOS") do not support exporting chat
    history.
    """

    def __init__(
        self,
        path: Union[str, Path],
    ):
        """Initialize the TelegramChatLoader.

        Args:
            path (Union[str, Path]): Path to the exported Telegram chat zip,
                 directory, json, or HTML file.
        """
        # Stored as a string because the helpers below rely on str.endswith
        # and the os.path / zipfile string APIs.
        self.path = path if isinstance(path, str) else str(path)

    def _load_single_chat_session_html(self, file_path: str) -> ChatSession:
        """Load a single chat session from an HTML file.

        Every element matching ``.message.default`` becomes one
        ``HumanMessage``. A message without a ``.from_name`` element inherits
        the sender of the preceding message; if there is no preceding sender
        the message is skipped. A missing date or text element yields an empty
        string for that field, mirroring the defaults of the JSON loader.

        Args:
            file_path (str): Path to the HTML file.

        Returns:
            ChatSession: The loaded chat session.

        Raises:
            ImportError: If the ``beautifulsoup4`` package is not installed.
        """
        try:
            from bs4 import BeautifulSoup
        except ImportError as e:
            raise ImportError(
                "Please install the 'beautifulsoup4' package to load"
                " Telegram HTML files. You can do this by running"
                " 'pip install beautifulsoup4' in your terminal."
            ) from e

        with open(file_path, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        results: List[Union[HumanMessage, AIMessage]] = []
        previous_sender = None
        for message in soup.select(".message.default"):
            # select_one returns None when nothing matches, so guard before
            # reading the attribute instead of crashing on the whole file.
            date_element = message.select_one(".pull_right.date.details")
            timestamp = (
                date_element.get("title", "") if date_element is not None else ""
            )

            from_name_element = message.select_one(".from_name")
            if from_name_element is None and previous_sender is None:
                logger.debug("from_name not found in message")
                continue
            elif from_name_element is None:
                # No sender element on this message: reuse the last one seen.
                from_name = previous_sender
            else:
                from_name = from_name_element.text.strip()

            # A message may have no ".text" element; keep it with empty
            # content rather than raising AttributeError.
            text_element = message.select_one(".text")
            text = text_element.text.strip() if text_element is not None else ""

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )
            previous_sender = from_name

        return ChatSession(messages=results)

    def _load_single_chat_session_json(self, file_path: str) -> ChatSession:
        """Load a single chat session from a JSON file.

        Reads the top-level ``"messages"`` list and converts each entry to a
        ``HumanMessage``; the ``"text"``, ``"date"`` and ``"from"`` fields
        default to an empty string when absent.

        Args:
            file_path (str): Path to the JSON file.

        Returns:
            ChatSession: The loaded chat session.
        """
        with open(file_path, "r", encoding="utf-8") as file:
            data = json.load(file)

        messages = data.get("messages", [])
        results: List[BaseMessage] = []
        for message in messages:
            text = message.get("text", "")
            timestamp = message.get("date", "")
            from_name = message.get("from", "")

            results.append(
                HumanMessage(
                    content=text,
                    additional_kwargs={
                        "sender": from_name,
                        "events": [{"message_time": timestamp}],
                    },
                )
            )

        return ChatSession(messages=results)

    def _iterate_files(self, path: str) -> Iterator[str]:
        """Iterate over files in a directory or zip file.

        Handles three kinds of input: a single ``.html``/``.json`` file, a
        directory (walked recursively), or a zip archive. Any other path
        yields nothing.

        Args:
            path (str): Path to the directory or zip file.

        Yields:
            str: Path to each file.
        """
        if os.path.isfile(path) and path.endswith(_SUPPORTED_SUFFIXES):
            yield path
        elif os.path.isdir(path):
            for root, _, files in os.walk(path):
                for file in files:
                    if file.endswith(_SUPPORTED_SUFFIXES):
                        yield os.path.join(root, file)
        elif zipfile.is_zipfile(path):
            with zipfile.ZipFile(path) as zip_file:
                for file in zip_file.namelist():
                    if file.endswith(_SUPPORTED_SUFFIXES):
                        # The member is extracted into a fresh temporary
                        # directory that is removed as soon as the generator
                        # is resumed, so the yielded path must be consumed
                        # before asking for the next one.
                        with tempfile.TemporaryDirectory() as temp_dir:
                            yield zip_file.extract(file, path=temp_dir)

    def lazy_load(self) -> Iterator[ChatSession]:
        """Lazy load the messages from the chat file and yield them
        in as chat sessions.

        Yields:
            ChatSession: The loaded chat session.
        """
        for file_path in self._iterate_files(self.path):
            # Dispatch on the file suffix; _iterate_files only yields
            # .html and .json paths.
            if file_path.endswith(".html"):
                yield self._load_single_chat_session_html(file_path)
            elif file_path.endswith(".json"):
                yield self._load_single_chat_session_json(file_path)