From bf0887c486f87b2d5d1781971579f8843cc29032 Mon Sep 17 00:00:00 2001 From: vowelparrot <130414180+vowelparrot@users.noreply.github.com> Date: Thu, 13 Apr 2023 21:31:59 -0700 Subject: [PATCH] Add Slack Directory Loader (#2841) Fixes linting issue from #2835 Adds a loader for Slack Exports which can be a very valuable source of knowledge to use for internal QA bots and other use cases. ```py # Export data from your Slack Workspace first. from langchain.document_loaders import SlackDirectoryLoader SLACK_WORKSPACE_URL = "https://awesome.slack.com" loader = SlackDirectoryLoader("Slack_Exports", SLACK_WORKSPACE_URL) docs = loader.load() ``` --- .../examples/slack_directory.ipynb | 81 +++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/slack_directory.py | 112 ++++++++++++++++++ .../document_loaders/test_slack.py | 23 ++++ .../examples/slack_export.zip | Bin 0 -> 3904 bytes 5 files changed, 218 insertions(+) create mode 100644 docs/modules/indexes/document_loaders/examples/slack_directory.ipynb create mode 100644 langchain/document_loaders/slack_directory.py create mode 100644 tests/integration_tests/document_loaders/test_slack.py create mode 100644 tests/integration_tests/examples/slack_export.zip diff --git a/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb new file mode 100644 index 00000000..471efa53 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/slack_directory.ipynb @@ -0,0 +1,81 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "1dc7df1d", + "metadata": {}, + "source": [ + "# Slack (Local Exported Zipfile)\n", + "\n", + "This notebook covers how to load documents from a Zipfile generated from a Slack export.\n", + "\n", + "In order to get this Slack export, follow these instructions:\n", + "\n", + "## 🧑 Instructions for ingesting your own dataset\n", + "\n", + "Export your Slack data. 
You can do this by going to your Workspace Management page and clicking the Import/Export option ({your_slack_domain}.slack.com/services/export). Then, choose the right date range and click `Start export`. Slack will send you an email and a DM when the export is ready.\n", + "\n", + "The download will produce a `.zip` file in your Downloads folder (or wherever your downloads can be found, depending on your OS configuration).\n", + "\n", + "Copy the path to the `.zip` file, and assign it as `LOCAL_ZIPFILE` below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "007c5cbf", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import SlackDirectoryLoader " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1caec59", + "metadata": {}, + "outputs": [], + "source": [ + "# Optionally set your Slack URL. This will give you proper URLs in the docs sources.\n", + "SLACK_WORKSPACE_URL = \"https://xxx.slack.com\"\n", + "LOCAL_ZIPFILE = \"\" # Paste the local path to your Slack zip file here.\n", + "\n", + "loader = SlackDirectoryLoader(LOCAL_ZIPFILE, SLACK_WORKSPACE_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1c30ff7", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()\n", + "docs" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 956f85f9..c2ea430a 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -55,6 +55,7 @@ from 
langchain.document_loaders.roam import RoamLoader from langchain.document_loaders.s3_directory import S3DirectoryLoader from langchain.document_loaders.s3_file import S3FileLoader from langchain.document_loaders.sitemap import SitemapLoader +from langchain.document_loaders.slack_directory import SlackDirectoryLoader from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.text import TextLoader @@ -138,4 +139,5 @@ __all__ = [ "DuckDBLoader", "BigQueryLoader", "BiliBiliLoader", + "SlackDirectoryLoader", ] diff --git a/langchain/document_loaders/slack_directory.py b/langchain/document_loaders/slack_directory.py new file mode 100644 index 00000000..718367c4 --- /dev/null +++ b/langchain/document_loaders/slack_directory.py @@ -0,0 +1,112 @@ +"""Loader for documents from a Slack export.""" +import json +import zipfile +from pathlib import Path +from typing import Dict, List, Optional + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class SlackDirectoryLoader(BaseLoader): + """Loader for loading documents from a Slack directory dump.""" + + def __init__(self, zip_path: str, workspace_url: Optional[str] = None): + """Initialize the SlackDirectoryLoader. + + Args: + zip_path (str): The path to the Slack directory dump zip file. + workspace_url (Optional[str]): The Slack workspace URL. + Including the URL will turn + sources into links. Defaults to None. 
+ """ + self.zip_path = Path(zip_path) + self.workspace_url = workspace_url + self.channel_id_map = self._get_channel_id_map(self.zip_path) + + @staticmethod + def _get_channel_id_map(zip_path: Path) -> Dict[str, str]: + """Get a dictionary mapping channel names to their respective IDs.""" + with zipfile.ZipFile(zip_path, "r") as zip_file: + try: + with zip_file.open("channels.json", "r") as f: + channels = json.load(f) + return {channel["name"]: channel["id"] for channel in channels} + except KeyError: + return {} + + def load(self) -> List[Document]: + """Load and return documents from the Slack directory dump.""" + docs = [] + with zipfile.ZipFile(self.zip_path, "r") as zip_file: + for channel_path in zip_file.namelist(): + channel_name = Path(channel_path).parent.name + if not channel_name: + continue + if channel_path.endswith(".json"): + messages = self._read_json(zip_file, channel_path) + for message in messages: + document = self._convert_message_to_document( + message, channel_name + ) + docs.append(document) + return docs + + def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]: + """Read JSON data from a zip subfile.""" + with zip_file.open(file_path, "r") as f: + data = json.load(f) + return data + + def _convert_message_to_document( + self, message: dict, channel_name: str + ) -> Document: + """ + Convert a message to a Document object. + + Args: + message (dict): A message in the form of a dictionary. + channel_name (str): The name of the channel the message belongs to. + + Returns: + Document: A Document object representing the message. 
+ """ + text = message.get("text", "") + metadata = self._get_message_metadata(message, channel_name) + return Document( + page_content=text, + metadata=metadata, + ) + + def _get_message_metadata(self, message: dict, channel_name: str) -> dict: + """Create and return metadata for a given message and channel.""" + timestamp = message.get("ts", "") + user = message.get("user", "") + source = self._get_message_source(channel_name, user, timestamp) + return { + "source": source, + "channel": channel_name, + "timestamp": timestamp, + "user": user, + } + + def _get_message_source(self, channel_name: str, user: str, timestamp: str) -> str: + """ + Get the message source as a string. + + Args: + channel_name (str): The name of the channel the message belongs to. + user (str): The user ID who sent the message. + timestamp (str): The timestamp of the message. + + Returns: + str: The message source. + """ + if self.workspace_url: + channel_id = self.channel_id_map.get(channel_name, "") + return ( + f"{self.workspace_url}/archives/{channel_id}" + + f"/p{timestamp.replace('.', '')}" + ) + else: + return f"{channel_name} - {user} - {timestamp}" diff --git a/tests/integration_tests/document_loaders/test_slack.py b/tests/integration_tests/document_loaders/test_slack.py new file mode 100644 index 00000000..7baa1319 --- /dev/null +++ b/tests/integration_tests/document_loaders/test_slack.py @@ -0,0 +1,23 @@ +"""Tests for the Slack directory loader""" +from pathlib import Path + +from langchain.document_loaders import SlackDirectoryLoader + + +def test_slack_directory_loader() -> None: + """Test Slack directory loader.""" + file_path = Path(__file__).parent.parent / "examples/slack_export.zip" + loader = SlackDirectoryLoader(str(file_path)) + docs = loader.load() + + assert len(docs) == 5 + + +def test_slack_directory_loader_urls() -> None: + """Test workspace URLS are passed through in the SlackDirectoryloader.""" + file_path = Path(__file__).parent.parent / 
"examples/slack_export.zip" + workspace_url = "example_workspace.com" + loader = SlackDirectoryLoader(str(file_path), workspace_url) + docs = loader.load() + for doc in docs: + assert doc.metadata["source"].startswith(workspace_url) diff --git a/tests/integration_tests/examples/slack_export.zip b/tests/integration_tests/examples/slack_export.zip new file mode 100644 index 0000000000000000000000000000000000000000..756809ad71938a1460b9428c2f79712cbc514be9 GIT binary patch literal 3904 zcmbW4c{G%5AIEQ7%t+R<3`O=BCM1tUGgEeD*9aMm-B^YcS}3IKOOZW;>|2&>B}?{Q z7)^<636HWSdT;fdH}6B|dEfWUx#ryS$9(VKxqhGP`}uyR%r;PSXi4^U7qQvkq^zt6>x@gxM> ztA%UOFo(O5LcT=LJ1oYp8Ebp=4Pw|2gm)D2kNBqA_nKK2lnckzixAs!*%?`n%I?yS zJ{*4@)^(v}4h?ZCa8rOMKX&VL*J@)-_{u9{v=Tl>?WRBr<&HXfPtF{2t;ir}qjqks zmT;G|>%NM;lgDspkdX=Rq19NG`&XY~tO9~=WiB0GVpB^xh@oup9!2?%kdtBI!lm&x zW*M2ta9f%yBO?)2s01rJ?ySPVOb-ps;*R&+5*KYZnV7{t)Bstr&-jz+ zF?LOQLZhlA7iM~*q8r6soija@deF0*O)MuW&c^2cX-r1(M`}^jv8)-BZByvi=h}>X z5s#)|XOH4oIyh9alW;PA)KBzd&qq)v=o=MfhjJJ!Q?|wGV7X`Xl|IyGtY&|zl04XX zW)?T(IcxOx%#@%}6|Tj>?iyW~*Lu0=%5==vY=PtZsq$`xw=yPwIGRDA&ffGh6s==+ z3~t>-9vM<$I+%`W%5gL_s5NALa4I1B;>CzJPch0fT&0pIY>F&8Csy-a^*vk}EO^d3 zFQ;U`;(Gx>ZSvt3LglpwmcvOCTZ41B%FQbz$z=V^3<$L3VV zN0a!ex$d0xn>Vs6pYRmNBVR$9xMZrK@8YP+bC7ABaRUqG#_}@zZ(EO=)qFZfqOYo7 zoVdhLEKzfQG(L4-qI;L{`8J5VEdsyv$?W@)&lT>gJlY^~P!Kr)fLVqZ{&@K7iwQ&y zcNOd8;)sL)g`VUO>h>AQST5JcH_H_jnZE$8bIOaTkm?}^wLM-E}m(AD{#6d40zPPht%)M2cuaz9qF2IAGI z8p~~K*KH<~TgC?)h7|^VWTwsJ@j=G*EaZ~wss+7H(d^m=BEtlDHoCzUww%_Gsi^kN zMH%(+9D881dRlRJ4ZGJIzl^FEbPeyg_Hh!ZxB~4Cg$Kl0LfvJT$r8?+Q8y6O4=Kg! z)oZMdme^AuU)Br6@8j6egDNDU?zG5g@oLDW$}n!2ew%+;R#)XI0t@P-bF*(qQmbne zUxI{q3#AuNN*?llb78fyP{(YRRQ=FJTh5OkTGRX=|vDge&wdd4`JrNdb<%x1D` zqn7S)r>?z!2x}=BVj$t{hdX<#DA=0sf*o`3y(olDF*K*59b+6Jg{?42qc|hsvL?Rk=9=l~_exJ? z|KwV=*0|t$ii@$|%!xJqjl#Tzm*a@P=iT<+;v7asJ5J^RpkiC2=&QF~L3u8}k5#u&ZnG zTkSqA(+L7Pe8_;~HVQ-;3DkKG`f&xcpBjbeXC;}hmh=X!p9+ywU!BRF5YtSZmwh)! 
zueeQr=P&45S{nGXAmBOTo5cpzFZYlHNl=`*P~-~wP%gL*f+X1C-ElTbnU}qf8jj*3(n4L`9;zu(lmbY+Pr{7pJh~%J zTCMy!25C`ViWO5X zp6^%-PAjr>|+Nndd5&G82bPxSn!> z-b%6IpaApUys~eYZi!9*9K42kmZ*byEvxFcB0eTE%kY#273uF<(Hv=8E3havqZEJt z(RJh2eEQ?D=UlH4%OM#zN3|AVE}PI95{*uZ<0%eKMcH(&xc43t)87|4)`BO$)+e8o zWinZf>Y4R;$IjSGa`dxej*=l&JHIKLVKo^PZv2yTe6ssV<%gHnuGxW-h?HsOr5<}V z9rHLodX5kHE|W5hEf=PuZipf04Ufq4+73@2dUd`6|$Su7=2R$=vY1r1K11{SsmMeoVZ2mRSh=@%=9jnRtv|Tpy0BRUS+{(0b4~2^>gu(x9oGgZ zRZ|7v@}6nD3Y;5Xj(jy<)c(1cu% zx6KXnOvwqam=;uRx!MQN=qfr4@y}wf~7Fj^U)v$DU z+CbVf+C;!~h)Z$#8}A+7^SY9ul?PKTsI%=SUIfi1UJkK&pV<}7)c-=GkKpq$xs-pbhHDWS-l3)3naVB0Op0gw*nMxTZ$*qkOaHk2SFv!4uGN>!8fV4xKt=K+)iXsYfgA zhx#jxa`JU;PgJv(ppoXv+)l-|kl?7ZP|N0GBYPdyQ__#=wfl>^KZOT9X(IJ4^bL1B zY|A1n`Fv;B)}Wv~O!d3#0mqNj208{n^xb~^+hu{~=Vf76WZRbfUHtng2>?vM3nT&L zdUqHERky?7o~pl#f;$VX*+X@g#epx`4u*TaWbe}HfSsimfC=