diff --git a/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb new file mode 100644 index 00000000..471efa53 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1dc7df1d", + "metadata": {}, + "source": [ + "# Slack (Local Exported Zipfile)\n", + "\n", + "This notebook covers how to load documents from a Zipfile generated from a Slack export.\n", + "\n", + "In order to get this Slack export, follow these instructions:\n", + "\n", + "## 🧑 Instructions for ingesting your own dataset\n", + "\n", + "Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n", + "\n", + "The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n", + "\n", + "Copy the path to the `.zip` file, and assign it as `LOCAL_ZIPFILE` below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "007c5cbf", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import SlackDirectoryLoader " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1caec59", + "metadata": {}, + "outputs": [], + "source": [ + "# Optionally set your Slack URL. 
This will give you proper URLs in the docs sources.\n", + "SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n", + "LOCAL_ZIPFILE = \"\" # Paste the local path to your Slack zip file here.\n", + "\n", + "loader = SlackDirectoryLoader(LOCAL_ZIPFILE, SLACK_WORKSPACE_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1c30ff7", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()\n", + "docs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 956f85f9..c2ea430a 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -55,6 +55,7 @@ from langchain.document_loaders.roam import RoamLoader from langchain.document_loaders.s3_directory import S3DirectoryLoader from langchain.document_loaders.s3_file import S3FileLoader from langchain.document_loaders.sitemap import SitemapLoader +from langchain.document_loaders.slack_directory import SlackDirectoryLoader from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.text import TextLoader @@ -138,4 +139,5 @@ __all__ = [ "DuckDBLoader", "BigQueryLoader", "BiliBiliLoader", + "SlackDirectoryLoader", ] diff --git a/langchain/document_loaders/slack_directory.py b/langchain/document_loaders/slack_directory.py new file mode 100644 index 00000000..718367c4 --- /dev/null +++ b/langchain/document_loaders/slack_directory.py @@ -0,0 +1,112 @@ +"""Loader for documents from a Slack 
"""Loader for documents from a Slack export."""
import json
import zipfile
from pathlib import Path
from typing import Dict, List, Optional

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class SlackDirectoryLoader(BaseLoader):
    """Loader for loading documents from a Slack directory dump.

    The dump is read straight from the exported zip file. Every ``.json``
    member that sits inside a directory is treated as a message file of the
    channel named after that directory, and each message in it becomes one
    ``Document``. JSON files at the root of the archive (for example
    ``channels.json``) are not loaded as messages.
    """

    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
        """Initialize the SlackDirectoryLoader.

        The archive is opened once here to build the channel name -> ID map,
        so ``zip_path`` must already point at a readable zip file.

        Args:
            zip_path: The path to the Slack directory dump zip file.
            workspace_url: The Slack workspace URL. Including the URL will
                turn sources into links. Defaults to None.
        """
        self.zip_path = Path(zip_path)
        self.workspace_url = workspace_url
        self.channel_id_map = self._get_channel_id_map(self.zip_path)

    @staticmethod
    def _get_channel_id_map(zip_path: Path) -> Dict[str, str]:
        """Get a dictionary mapping channel names to their respective IDs.

        Args:
            zip_path: The path to the Slack directory dump zip file.

        Returns:
            The mapping read from ``channels.json``; an empty dict when the
            archive has no such member. Entries lacking a ``name`` or an
            ``id`` are skipped.
        """
        with zipfile.ZipFile(zip_path, "r") as zip_file:
            try:
                with zip_file.open("channels.json", "r") as f:
                    channels = json.load(f)
            except KeyError:
                # ZipFile.open raises KeyError for a missing member. The try
                # covers only that lookup so that a malformed channel entry
                # can no longer wipe out the whole map by accident.
                return {}
        return {
            channel["name"]: channel["id"]
            for channel in channels
            if isinstance(channel, dict) and "name" in channel and "id" in channel
        }

    def load(self) -> List[Document]:
        """Load and return documents from the Slack directory dump.

        Returns:
            One ``Document`` per message, in archive member order.
        """
        docs: List[Document] = []
        with zipfile.ZipFile(self.zip_path, "r") as zip_file:
            for member_path in zip_file.namelist():
                # The channel is the directory that directly holds the file;
                # members at the archive root have no channel and are skipped.
                channel_name = Path(member_path).parent.name
                if not channel_name or not member_path.endswith(".json"):
                    continue
                docs.extend(
                    self._convert_message_to_document(message, channel_name)
                    for message in self._read_json(zip_file, member_path)
                )
        return docs

    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
        """Read JSON data from a zip subfile.

        Args:
            zip_file: The opened Slack export archive.
            file_path: The name of the member to parse.

        Returns:
            The parsed JSON content of the member.
        """
        with zip_file.open(file_path, "r") as f:
            return json.load(f)

    def _convert_message_to_document(
        self, message: dict, channel_name: str
    ) -> Document:
        """Convert a message to a Document object.

        Args:
            message: A message in the form of a dictionary.
            channel_name: The name of the channel the message belongs to.

        Returns:
            A Document whose content is the message ``text`` (empty string
            when absent) and whose metadata comes from
            ``_get_message_metadata``.
        """
        return Document(
            page_content=message.get("text", ""),
            metadata=self._get_message_metadata(message, channel_name),
        )

    def _get_message_metadata(self, message: dict, channel_name: str) -> dict:
        """Create and return metadata for a given message and channel.

        Args:
            message: A message in the form of a dictionary.
            channel_name: The name of the channel the message belongs to.

        Returns:
            A dict with the keys ``source``, ``channel``, ``timestamp`` and
            ``user``; a missing ``ts`` or ``user`` becomes an empty string.
        """
        timestamp = message.get("ts", "")
        user = message.get("user", "")
        return {
            "source": self._get_message_source(channel_name, user, timestamp),
            "channel": channel_name,
            "timestamp": timestamp,
            "user": user,
        }

    def _get_message_source(self, channel_name: str, user: str, timestamp: str) -> str:
        """Get the message source as a string.

        Args:
            channel_name: The name of the channel the message belongs to.
            user: The user ID who sent the message.
            timestamp: The timestamp of the message.

        Returns:
            A link of the form ``<workspace_url>/archives/<channel_id>/p<ts>``
            (with the dot removed from the timestamp) when a workspace URL is
            configured, otherwise ``"<channel> - <user> - <timestamp>"``.
        """
        if not self.workspace_url:
            return f"{channel_name} - {user} - {timestamp}"

        # An unknown channel keeps the historical behaviour of an empty ID.
        channel_id = self.channel_id_map.get(channel_name, "")
        # Drop trailing slashes so "https://x.slack.com/" does not produce
        # "https://x.slack.com//archives/...".
        base_url = self.workspace_url.rstrip("/")
        message_id = timestamp.replace(".", "")
        return f"{base_url}/archives/{channel_id}/p{message_id}"


# ---------------------------------------------------------------------------
# tests/integration_tests/document_loaders/test_slack.py (new file)
# Tests for the Slack directory loader
# ---------------------------------------------------------------------------
from pathlib import Path

from langchain.document_loaders import SlackDirectoryLoader


def test_slack_directory_loader() -> None:
    """Test Slack directory loader."""
    file_path = Path(__file__).parent.parent / "examples/slack_export.zip"
    loader = SlackDirectoryLoader(str(file_path))
    docs = loader.load()

    assert len(docs) == 5


def test_slack_directory_loader_urls() -> None:
    """Test workspace URLs are passed through in the SlackDirectoryLoader."""
    file_path = Path(__file__).parent.parent / "examples/slack_export.zip"
    workspace_url = "example_workspace.com"
    loader = SlackDirectoryLoader(str(file_path), workspace_url)
    docs = loader.load()
    for doc in docs:
        assert doc.metadata["source"].startswith(workspace_url)


# ---------------------------------------------------------------------------
# tests/integration_tests/examples/slack_export.zip (new binary file; its
# contents are not part of the textual diff)
# ---------------------------------------------------------------------------