From bf0887c486f87b2d5d1781971579f8843cc29032 Mon Sep 17 00:00:00 2001
From: vowelparrot <130414180+vowelparrot@users.noreply.github.com>
Date: Thu, 13 Apr 2023 21:31:59 -0700
Subject: [PATCH] Add Slack Directory Loader (#2841)

Fixes linting issue from #2835

Adds a loader for Slack Exports which can be a very valuable source of
knowledge to use for internal QA bots and other use cases.

```py
# Export data from your Slack Workspace first.
from langchain.document_loaders import SlackDirectoryLoader

SLACK_WORKSPACE_URL = "https://awesome.slack.com"

loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL)
docs = loader.load()
```
---
 .../examples/slack_directory.ipynb            |  81 +++++++++++++
 langchain/document_loaders/__init__.py        |   2 +
 langchain/document_loaders/slack_directory.py | 112 ++++++++++++++++++
 .../document_loaders/test_slack.py            |  23 ++++
 .../examples/slack_export.zip                 | Bin 0 -> 3904 bytes
 5 files changed, 218 insertions(+)
 create mode 100644 docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
 create mode 100644 langchain/document_loaders/slack_directory.py
 create mode 100644 tests/integration_tests/document_loaders/test_slack.py
 create mode 100644 tests/integration_tests/examples/slack_export.zip

diff --git a/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
new file mode 100644
index 00000000..471efa53
--- /dev/null
+++ b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb
@@ -0,0 +1,81 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "id": "1dc7df1d",
+   "metadata": {},
+   "source": [
+    "# Slack (Local Exported Zipfile)\n",
+    "\n",
+    "This notebook covers how to load documents from a Zipfile generated from a Slack export.\n",
+    "\n",
+    "In order to get this Slack export, follow these instructions:\n",
+    "\n",
+    "## 🧑 Instructions for ingesting your own dataset\n",
+    "\n",
+    "Export your Slack data. You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n",
+    "\n",
+    "The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n",
+    "\n",
+    "Copy the path to the `.zip` file, and assign it as `LOCAL_ZIPFILE` below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "007c5cbf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import SlackDirectoryLoader "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a1caec59",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optionally set your Slack URL. This will give you proper URLs in the docs sources.\n",
+    "SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n",
+    "LOCAL_ZIPFILE = \"\" # Paste the local path to your Slack zip file here.\n",
+    "\n",
+    "loader = SlackDirectoryLoader(LOCAL_ZIPFILE, SLACK_WORKSPACE_URL)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c30ff7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "docs = loader.load()\n",
+    "docs"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py
index 956f85f9..c2ea430a 100644
--- a/langchain/document_loaders/__init__.py
+++ b/langchain/document_loaders/__init__.py
@@ -55,6 +55,7 @@ from langchain.document_loaders.roam import RoamLoader
 from langchain.document_loaders.s3_directory import S3DirectoryLoader
 from langchain.document_loaders.s3_file import S3FileLoader
 from langchain.document_loaders.sitemap import SitemapLoader
+from langchain.document_loaders.slack_directory import SlackDirectoryLoader
 from langchain.document_loaders.srt import SRTLoader
 from langchain.document_loaders.telegram import TelegramChatLoader
 from langchain.document_loaders.text import TextLoader
@@ -138,4 +139,5 @@ __all__ = [
     "DuckDBLoader",
     "BigQueryLoader",
     "BiliBiliLoader",
+    "SlackDirectoryLoader",
 ]
diff --git a/langchain/document_loaders/slack_directory.py b/langchain/document_loaders/slack_directory.py
new file mode 100644
index 00000000..718367c4
--- /dev/null
+++ b/langchain/document_loaders/slack_directory.py
@@ -0,0 +1,112 @@
+"""Loader for documents from a Slack export."""
+import json
+import zipfile
+from pathlib import Path
+from typing import Dict, List, Optional
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class SlackDirectoryLoader(BaseLoader):
+    """Loader for loading documents from a Slack directory dump."""
+
+    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
+        """Initialize the SlackDirectoryLoader.
+
+        Args:
+            zip_path (str): The path to the Slack directory dump zip file.
+            workspace_url (Optional[str]): The Slack workspace URL.
+              Including the URL will turn
+              sources into links. Defaults to None.
+        """
+        self.zip_path = Path(zip_path)
+        self.workspace_url = workspace_url
+        self.channel_id_map = self._get_channel_id_map(self.zip_path)
+
+    @staticmethod
+    def _get_channel_id_map(zip_path: Path) -> Dict[str, str]:
+        """Get a dictionary mapping channel names to their respective IDs."""
+        with zipfile.ZipFile(zip_path, "r") as zip_file:
+            try:
+                with zip_file.open("channels.json", "r") as f:
+                    channels = json.load(f)
+                return {channel["name"]: channel["id"] for channel in channels}
+            except KeyError:
+                return {}
+
+    def load(self) -> List[Document]:
+        """Load and return documents from the Slack directory dump."""
+        docs = []
+        with zipfile.ZipFile(self.zip_path, "r") as zip_file:
+            for channel_path in zip_file.namelist():
+                channel_name = Path(channel_path).parent.name
+                if not channel_name:
+                    continue
+                if channel_path.endswith(".json"):
+                    messages = self._read_json(zip_file, channel_path)
+                    for message in messages:
+                        document = self._convert_message_to_document(
+                            message, channel_name
+                        )
+                        docs.append(document)
+        return docs
+
+    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
+        """Read JSON data from a zip subfile."""
+        with zip_file.open(file_path, "r") as f:
+            data = json.load(f)
+        return data
+
+    def _convert_message_to_document(
+        self, message: dict, channel_name: str
+    ) -> Document:
+        """
+        Convert a message to a Document object.
+
+        Args:
+            message (dict): A message in the form of a dictionary.
+            channel_name (str): The name of the channel the message belongs to.
+
+        Returns:
+            Document: A Document object representing the message.
+        """
+        text = message.get("text", "")
+        metadata = self._get_message_metadata(message, channel_name)
+        return Document(
+            page_content=text,
+            metadata=metadata,
+        )
+
+    def _get_message_metadata(self, message: dict, channel_name: str) -> dict:
+        """Create and return metadata for a given message and channel."""
+        timestamp = message.get("ts", "")
+        user = message.get("user", "")
+        source = self._get_message_source(channel_name, user, timestamp)
+        return {
+            "source": source,
+            "channel": channel_name,
+            "timestamp": timestamp,
+            "user": user,
+        }
+
+    def _get_message_source(self, channel_name: str, user: str, timestamp: str) -> str:
+        """
+        Get the message source as a string.
+
+        Args:
+            channel_name (str): The name of the channel the message belongs to.
+            user (str): The user ID who sent the message.
+            timestamp (str): The timestamp of the message.
+
+        Returns:
+            str: The message source.
+        """
+        if self.workspace_url:
+            channel_id = self.channel_id_map.get(channel_name, "")
+            return (
+                f"{self.workspace_url}/archives/{channel_id}"
+                + f"/p{timestamp.replace('.', '')}"
+            )
+        else:
+            return f"{channel_name} - {user} - {timestamp}"
diff --git a/tests/integration_tests/document_loaders/test_slack.py b/tests/integration_tests/document_loaders/test_slack.py
new file mode 100644
index 00000000..7baa1319
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_slack.py
@@ -0,0 +1,23 @@
+"""Tests for the Slack directory loader"""
+from pathlib import Path
+
+from langchain.document_loaders import SlackDirectoryLoader
+
+
+def test_slack_directory_loader() -> None:
+    """Test Slack directory loader."""
+    file_path = Path(__file__).parent.parent / "examples/slack_export.zip"
+    loader = SlackDirectoryLoader(str(file_path))
+    docs = loader.load()
+
+    assert len(docs) == 5
+
+
+def test_slack_directory_loader_urls() -> None:
+    """Test workspace URLS are passed through in the SlackDirectoryloader."""
+    file_path = Path(__file__).parent.parent / "examples/slack_export.zip"
+    workspace_url = "example_workspace.com"
+    loader = SlackDirectoryLoader(str(file_path), workspace_url)
+    docs = loader.load()
+    for doc in docs:
+        assert doc.metadata["source"].startswith(workspace_url)
diff --git a/tests/integration_tests/examples/slack_export.zip b/tests/integration_tests/examples/slack_export.zip
new file mode 100644
index 0000000000000000000000000000000000000000..756809ad71938a1460b9428c2f79712cbc514be9
GIT binary patch
literal 3904
zcmbW4c{G%5AIEQ7%t+R<3`O=BCM1tUGgEeD*9aMm-B^YcS}3IKOOZW;>|2&>B}?{Q
z7)^<636HWSdT;fdH}6B|dEfWUx#ryS$9(VKxqhGP`}<zn8dTJ5007VdXuA-@581Sa
z;*<c83;_TJfE%#1#p3ZeM}mYs!5MF$4*{r>uyR%r;PSXi4^U7qQvkq^zt6>x@gxM>
ztA%UOFo(O5LcT=LJ1oYp8Ebp=4Pw|2gm)D2kNBqA_nKK2lnckzixAs!*%?`n%I?yS
zJ{*4@)^(v}4h?ZCa8rOMKX&VL*J@)-_{u9{v=Tl>?WRBr<&HXfPtF{2t;ir}qjqks
zmT;G|>%NM;lgDspkdX=Rq19NG`&XY~tO9~=WiB0GVpB^xh@oup9!2?%kdtBI!lm&x
zW<!tIBFma(>*M2ta9f%yBO?)2s01rJ?ySPVOb-ps;*R&+5*KYZnV7{t)Bstr&-jz+
zF?LOQLZhlA7iM~*q8r6soija@deF0*O)MuW&c^2cX-r1(M`}^jv8)-BZByvi=h}>X
z5s#)|XOH4oIyh9alW;PA)KBzd&qq)v=o=MfhjJJ!Q?|wGV7X`Xl|IyGtY&|zl04XX
zW)?T(IcxOx%#@%}6|Tj>?iyW~*Lu0=%5==vY=PtZsq$`xw=yPwIGRDA&ffGh6s==+
z3~t>-9vM<$I+%`W%5gL_s5NALa4I1B;>CzJPch0fT&0pIY>F&8Csy-a^*vk}EO^d3
zFQ;U`;(Gx>ZSvt3LglpwmcvOCTZ41B%FQbz$z<Nz5uRmZ2C^e0<BLz>=V^3<$L3VV
zN0a!ex$d0xn>Vs6pYRmNBVR$9xMZrK@8YP+bC7ABaRUqG#_}@zZ(EO=)qFZfqOYo7
zoVdhLEKzfQG(L4-qI;L{`8J5VEdsyv$?W@)&lT>gJlY^~P!Kr)fLVqZ{&@K7iwQ&y
zcNOd8;)sL)g`VUO>h>AQS<!*(Z`5ro{e)K&g!c?hvRNDmWF9pD9N9xj3Lzy8N65gB
zOaH>T5JcH_H_jnZE$8bIOaTkm?}^wLM-E}m(AD{#6d40zPPht%)M2cuaz9qF2IAGI
z8p~~K*KH<~TgC?)h7|^VWTwsJ@j=G*EaZ~wss+7H(d^m=BEtlDHoCzUww%_Gsi^kN
zMH%(+9D881dRlRJ4ZGJIzl^FEbPeyg_Hh!ZxB~4Cg$Kl0LfvJT$r8?+Q8y6O4=Kg!
z)oZMdme^AuU)Br6@8j6egDNDU?zG5g@oLDW$}n!2ew%+;R#)XI0t@P-bF*(qQmbne
zUxI{q3#AuNN*?llb78<i6Jl1FA5)^(jl*Z$*dW#;)Wt73C=*{z#3|#?nliX=r4IDL
zW|oMlDob9+*Cz-y({MWVoRh04RGM6@QhQ(<R+z)IfbEr@erEp@dG_7|e+;oz`%yHK
z>fyP{(YRRQ=F<meTtpu`PT^e5pS_G#zO0ytZk1NzEy*xGuxKLUbdd&UGnXe{N#W1y
zxtN!?x*|erF67St-tyIf@td%2z7c)XhjNA}+YIgam6ZLYw05Mug0*3<2nXM)*w*@h
z@O|gsvqJ}V+y;loxnUiDGDGuE%m{(Z5WJ}MpMlH_(gFbc4u!o~+2&*qR_3C*Li-R*
zO&}}X1ii?ipu`%!I}KFsjo7T*AZa^zf>JTh5OkTGRX=|vDge&wdd4`JrNdb<%x1D`
zqn7S)r>?z!2x}=BVj$t{hdX<#DA=0sf*o`3y(olDF*K*59b+6Jg{?42q<NEyPBuy_
zErbRB`F6cqrN7dY=ACg`+sQ89w~rTJI7KUGqHSNQtiPrm4}K*hBw+0jDc$;cN;UP;
zdZVT1)X9)mDfRhA?PmM^`{Nj$=Q}1(CsIBsQY^Jr_SvGu>c|hsvL?Rk=9=l~_exJ?
z|KwV=*0|t$ii@$|%!xJqj<xu(4@onSKOa#x7t-L+pVA*!rb|+Cl@eWp9ta+~P4*7S
z6)zeS;fFN33uG%g#hM3At#o><On$i}V#O(JMLtm_Y9?x#Xq$4FYopWT(=BB8g+<0y
zZBMS-m!3zn7V4Mvxm>l#Tzm*a@P=iT<+;v7asJ5J^RpkiC2=&QF~L3u8}k5#u&ZnG
zTkSqA(+L7Pe8_;~HVQ-;3DkKG`f&xcpBjbeXC;}hmh=X!p9+ywU!BRF5YtSZmwh)!
zueeQr=P&45S{nGXAmBOTo5cpzFZYlHNl=`*P~-~wP%gL*f+X1C-ElT<Sa&;T{AEXH
zn;j$23*?}9w`bhXpVS2?(|gDIAFs8u35wCAYg6e_@gii87i;$-BRe$OwP;${r=~Ij
zC~3?iMS}#*UD4764Gl{@G{0r6rkk>bnU}qf8jj*3(n4L`9;zu(lmbY+Pr{7pJh~%J
z<r<Xco2qx~T6j?TWg?4_lQ{7Evp@^h?}o)&IXnHA_PKo&u9UT^Sb<03Iu!sM`X%r`
zoCJSeQ)fQ}+x1l=rlbK16f)}cTH(!<;oH(3=_XGx^u)aT){D=>TCMy!25C`ViWO5X
zp6^%-PAjr<i=opMwawPVx+Kw&Op(GRvD9<?7Cnw?z3vZU>>|+Nndd5&G82bPxSn!>
z-b%6IpaApUys~eYZi!9*9K42kmZ*byEvxFcB0eTE%kY#273uF<(Hv=8E3havqZEJt
z(RJh2eEQ?D=UlH4%OM#zN3|AVE}PI95{*uZ<0%eKMcH(&xc43t)87|4)`BO$)+e8o
zWinZf>Y4R;$IjSGa`dxej*=l&JHIKLVKo^PZv2yTe6ssV<%gHnuGxW-h?HsOr5<}V
z9rHLodX5kHE|W5hEf=P<Csw&Y(TvyT90z<-%fls0(1_Fx6_C%Xp!Vl&swgjoO@u@9
zVH6LOrYu%daHdlTZSwHYvn1sV6Fq;@Qy8{;UJrsAXHtTsjZ)-nV4`NXXn#Sw-828N
z@hwp>uZipf04Ufq4+73@2dUd`6|$Su7=2R$=vY1r1<h-mPn2M!a(7kiEm1)tB7M%b
zv&@m+s)au>K11{SsmMeoVZ2mRSh=@%=9jnRtv|Tpy0BRUS+{(0b4~2^>gu(x9oGgZ
zRZ|7v@}6nD3Y;5Xj(j<BEQNTemGh8Cb!bMk9#K}9Du3qKBUOnQ!9Z8d>y<)c(1cu%
zx6<DVd+LN?@zMH4)bpWA8d^d}PmX!j^Kx6X!WW<{4~ndo4m?Phpb4GKCGtFOm(uD%
zT;7OSs1!@lm9l(u$XU3jm{XR|Cg!aay{H#*(;Ktww~_pmftAkyl5qM&iI;Djl&R~*
ztv*uV9G!<9XPUym%C#<X8FS%l>KXnOvwqam=;uRx!MQN=qfr4@y}wf~7Fj^U)v$DU
z+CbVf+C;!~h)Z$#8}A+7^SY9ul?PKTsI%=SUIfi1UJkK&pV<}<XSw`BmsK_Vbhp!B
z>7)c-=GkKpq$xs-pbhHDWS-l3)3naVB0Op0gw*nMxTZ$*qkOaHk2SFv!4u<Sz#i9i
z^)0Rp#WYbJ9_{u`G%L6vM{En=TDzU~ex$B{(ZyiFUGC)PE%YFdjDKj18YK!!5L=tq
zpF22m8p}{=4BdFe2ag+ih7l5H_shOfp_`#m`i534yT>GN>!8fV4xKt=K+)iXsYfgA
zhx#jxa`JU;PgJv(ppoXv+)l-|kl?7ZP|N0GBYPdyQ__#=wfl>^KZOT9X(IJ4^bL1B
zY|A1n`Fv;B)}Wv~O!d3#0mqNj208{n^xb~^+hu{~=Vf76WZRbfUHtng2>?vM3nT&L
zdUqHERky?7o~pl#f;$VX*+X@g#epx`4u*TaWbe}HfSsimfC=<h76H|^v&f!mKTF=7
zM0S?ExI=BPQuoLtI~%uqxa}0*0G;DEqYkWld(`b&_ujie`yabd1!k&$PoaPA+wK{(
sv(9h7#Qu*x*zJ`&wF|-9?oQ_fYu$5CX~4Y#00{VD0B0o6{_S7?0@WYVH2?qr

literal 0
HcmV?d00001