Add Slack Directory Loader (#2841)

Fixes linting issue from #2835.

Adds a loader for Slack exports, which can be a very valuable source of knowledge to use for internal QA bots and other use cases.

```py
# Export data from your Slack Workspace first.
from langchain.document_loaders import SlackDirectoryLoader

SLACK_WORKSPACE_URL = "https://awesome.slack.com"

loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL)
docs = loader.load()
```
2023-04-13 21:31:59 -07:00 · 2023-04-13 21:31:59 -07:00 · bf0887c486
commit bf0887c486
parent ed2ef5cbe4
5 changed files with 218 additions and 0 deletions
--- a/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
@ -0,0 +1,81 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "1dc7df1d",
+   "metadata": {},
+   "source": [
+    "# Slack (Local Exported Zipfile)\n",
+    "\n",
+    "This notebook covers how to load documents from a Zipfile generated from a Slack export.\n",
+    "\n",
+    "In order to get this Slack export, follow these instructions:\n",
+    "\n",
+    "## 🧑 Instructions for ingesting your own dataset\n",
+    "\n",
+    "Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n",
+    "\n",
+    "The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n",
+    "\n",
+    "Copy the path to the `.zip` file, and assign it as `LOCAL_ZIPFILE` below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "007c5cbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import SlackDirectoryLoader "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1caec59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optionally set your Slack URL. This will give you proper URLs in the docs sources.\n",
+    "SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n",
+    "LOCAL_ZIPFILE = \"\" # Paste the local path to your Slack zip file here.\n",
+    "\n",
+    "loader = SlackDirectoryLoader(LOCAL_ZIPFILE, SLACK_WORKSPACE_URL)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c30ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()\n",
+    "docs"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/langchain/document_loaders/init.py
+++ b/langchain/document_loaders/init.py
@ -55,6 +55,7 @@ from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
 from langchain.document_loaders.sitemap import SitemapLoader
+from langchain.document_loaders.slack_directory import SlackDirectoryLoader
 from langchain.document_loaders.srt import SRTLoader
 from langchain.document_loaders.telegram import TelegramChatLoader
 from langchain.document_loaders.text import TextLoader
@ -138,4 +139,5 @@ __all__ = [
    "DuckDBLoader",
    "BigQueryLoader",
    "BiliBiliLoader",
+    "SlackDirectoryLoader",
 ]
--- a/langchain/document_loaders/slack_directory.py
+++ b/langchain/document_loaders/slack_directory.py
@ -0,0 +1,112 @@
+"""Loader for documents from a Slack export."""
+import json
+import zipfile
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class SlackDirectoryLoader(BaseLoader):
+    """Loader for loading documents from a Slack directory dump.
+
+    The dump is read as a zip archive. Every ``.json`` member that sits inside
+    a folder of the archive is parsed, and each message object it contains
+    becomes one ``Document`` whose ``page_content`` is the message ``text``
+    and whose metadata carries ``source``, ``channel``, ``timestamp`` and
+    ``user``. The name of the containing folder is used as the channel name.
+    """
+
+    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
+        """Initialize the SlackDirectoryLoader.
+
+        Args:
+            zip_path (str): The path to the Slack directory dump zip file.
+            workspace_url (Optional[str]): The Slack workspace URL.
+              Including the URL will turn
+              sources into links. Defaults to None.
+        """
+        self.zip_path = Path(zip_path)
+        self.workspace_url = workspace_url
+        # The archive is opened once here so that channel names can be
+        # resolved to IDs when message links are built later.
+        self.channel_id_map = self._get_channel_id_map(self.zip_path)
+
+    @staticmethod
+    def _get_channel_id_map(zip_path: Path) -> Dict[str, str]:
+        """Get a dictionary mapping channel names to their respective IDs.
+
+        Args:
+            zip_path (Path): The path to the Slack directory dump zip file.
+
+        Returns:
+            Dict[str, str]: Channel name to channel ID. Empty when the archive
+              has no top-level ``channels.json``. Entries that lack a
+              ``name`` or an ``id`` are skipped.
+        """
+        with zipfile.ZipFile(zip_path, "r") as zip_file:
+            try:
+                # ZipFile.open raises KeyError when the member does not exist;
+                # only that lookup is guarded so other problems stay visible.
+                with zip_file.open("channels.json", "r") as f:
+                    channels = json.load(f)
+            except KeyError:
+                return {}
+        # One incomplete entry must not discard the IDs of all other channels.
+        return {
+            channel["name"]: channel["id"]
+            for channel in channels
+            if isinstance(channel, dict) and "name" in channel and "id" in channel
+        }
+
+    def load(self) -> List[Document]:
+        """Load and return documents from the Slack directory dump.
+
+        Returns:
+            List[Document]: One document per message object found in the
+              ``.json`` members that live inside a folder of the archive.
+        """
+        docs = []
+        with zipfile.ZipFile(self.zip_path, "r") as zip_file:
+            for channel_path in zip_file.namelist():
+                if not channel_path.endswith(".json"):
+                    continue
+                channel_name = Path(channel_path).parent.name
+                if not channel_name:
+                    # Members at the archive root (e.g. channels.json) have no
+                    # parent folder and therefore no channel to attach to.
+                    continue
+                messages = self._read_json(zip_file, channel_path)
+                if not isinstance(messages, list):
+                    # Not a list of messages; iterating it would yield keys or
+                    # characters instead of message objects.
+                    continue
+                for message in messages:
+                    if not isinstance(message, dict):
+                        continue
+                    document = self._convert_message_to_document(
+                        message, channel_name
+                    )
+                    docs.append(document)
+        return docs
+
+    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
+        """Read JSON data from a zip subfile.
+
+        Args:
+            zip_file (zipfile.ZipFile): The open archive to read from.
+            file_path (str): The member name inside the archive.
+
+        Returns:
+            List[dict]: The parsed JSON content of the member.
+        """
+        with zip_file.open(file_path, "r") as f:
+            data = json.load(f)
+        return data
+
+    def _convert_message_to_document(
+        self, message: dict, channel_name: str
+    ) -> Document:
+        """
+        Convert a message to a Document object.
+
+        Args:
+            message (dict): A message in the form of a dictionary.
+            channel_name (str): The name of the channel the message belongs to.
+
+        Returns:
+            Document: A Document object representing the message.
+        """
+        # Messages without a "text" key yield a document with empty content.
+        text = message.get("text", "")
+        metadata = self._get_message_metadata(message, channel_name)
+        return Document(
+            page_content=text,
+            metadata=metadata,
+        )
+
+    def _get_message_metadata(self, message: dict, channel_name: str) -> dict:
+        """Create and return metadata for a given message and channel.
+
+        Args:
+            message (dict): A message in the form of a dictionary.
+            channel_name (str): The name of the channel the message belongs to.
+
+        Returns:
+            dict: The keys ``source``, ``channel``, ``timestamp`` and ``user``.
+              A missing ``ts`` or ``user`` is reported as an empty string.
+        """
+        timestamp = message.get("ts", "")
+        user = message.get("user", "")
+        source = self._get_message_source(channel_name, user, timestamp)
+        return {
+            "source": source,
+            "channel": channel_name,
+            "timestamp": timestamp,
+            "user": user,
+        }
+
+    def _get_message_source(self, channel_name: str, user: str, timestamp: str) -> str:
+        """
+        Get the message source as a string.
+
+        Args:
+            channel_name (str): The name of the channel the message belongs to.
+            user (str): The user ID who sent the message.
+            timestamp (str): The timestamp of the message.
+
+        Returns:
+            str: A link built from the workspace URL, the channel ID and the
+              timestamp when a workspace URL was given; otherwise the plain
+              ``"<channel> - <user> - <timestamp>"`` description.
+        """
+        if self.workspace_url:
+            channel_id = self.channel_id_map.get(channel_name, "")
+            # A workspace URL pasted with a trailing slash must not produce
+            # a "//archives" segment in the link.
+            base_url = self.workspace_url.rstrip("/")
+            return f"{base_url}/archives/{channel_id}/p{timestamp.replace('.', '')}"
+        else:
+            return f"{channel_name} - {user} - {timestamp}"
--- a/tests/integration_tests/document_loaders/test_slack.py
+++ b/tests/integration_tests/document_loaders/test_slack.py
@ -0,0 +1,23 @@
+"""Tests for the Slack directory loader"""
+from pathlib import Path
+
+from langchain.document_loaders import SlackDirectoryLoader
+
+
+def test_slack_directory_loader() -> None:
+    """Load the bundled sample export and expect five documents."""
+    integration_tests_dir = Path(__file__).parent.parent
+    export_zip = integration_tests_dir / "examples/slack_export.zip"
+    documents = SlackDirectoryLoader(str(export_zip)).load()
+
+    assert len(documents) == 5
+
+
+def test_slack_directory_loader_urls() -> None:
+    """Test workspace URLs are passed through in the SlackDirectoryLoader."""
+    file_path = Path(__file__).parent.parent / "examples/slack_export.zip"
+    workspace_url = "example_workspace.com"
+    loader = SlackDirectoryLoader(str(file_path), workspace_url)
+    docs = loader.load()
+    # Guard against a vacuous pass: with no documents the loop below would
+    # never execute a single assertion.
+    assert docs
+    for doc in docs:
+        assert doc.metadata["source"].startswith(workspace_url)
--- a/tests/integration_tests/examples/slack_export.zip
+++ b/tests/integration_tests/examples/slack_export.zip