import json
import zipfile
from pathlib import Path
from typing import Dict, List, Optional

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class SlackDirectoryLoader(BaseLoader):
    """Load from a `Slack` directory dump."""

    def __init__(self, zip_path: str, workspace_url: Optional[str] = None):
        """Initialize the SlackDirectoryLoader.

        Args:
            zip_path (str): The path to the Slack directory dump zip file.
            workspace_url (Optional[str]): The Slack workspace URL.
                Including the URL will turn
                sources into links. Defaults to None.
        """
        self.zip_path = Path(zip_path)
        self.workspace_url = workspace_url
        self.channel_id_map = self._get_channel_id_map(self.zip_path)

    @staticmethod
    def _get_channel_id_map(zip_path: Path) -> Dict[str, str]:
        """Get a dictionary mapping channel names to their respective IDs."""
        with zipfile.ZipFile(zip_path, "r") as zip_file:
            try:
                with zip_file.open("channels.json", "r") as f:
                    channels = json.load(f)
                return {channel["name"]: channel["id"] for channel in channels}
            except KeyError:
                return {}

    def load(self) -> List[Document]:
        """Load and return documents from the Slack directory dump."""
        docs = []
        with zipfile.ZipFile(self.zip_path, "r") as zip_file:
            for channel_path in zip_file.namelist():
                channel_name = Path(channel_path).parent.name
                if not channel_name:
                    continue
                if channel_path.endswith(".json"):
                    messages = self._read_json(zip_file, channel_path)
                    for message in messages:
                        document = self._convert_message_to_document(
                            message, channel_name
                        )
                        docs.append(document)
        return docs

    def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]:
        """Read JSON data from a zip subfile."""
        with zip_file.open(file_path, "r") as f:
            data = json.load(f)
        return data

    def _convert_message_to_document(
        self, message: dict, channel_name: str
    ) -> Document:
        """
        Convert a message to a Document object.

        Args:
            message (dict): A message in the form of a dictionary.
            channel_name (str): The name of the channel the message belongs to.

        Returns:
            Document: A Document object representing the message.
        """
        text = message.get("text", "")
        metadata = self._get_message_metadata(message, channel_name)
        return Document(
            page_content=text,
            metadata=metadata,
        )

    def _get_message_metadata(self, message: dict, channel_name: str) -> dict:
        """Create and return metadata for a given message and channel."""
        timestamp = message.get("ts", "")
        user = message.get("user", "")
        source = self._get_message_source(channel_name, user, timestamp)
        return {
            "source": source,
            "channel": channel_name,
            "timestamp": timestamp,
            "user": user,
        }

    def _get_message_source(self, channel_name: str, user: str, timestamp: str) -> str:
        """
        Get the message source as a string.

        Args:
            channel_name (str): The name of the channel the message belongs to.
            user (str): The user ID who sent the message.
            timestamp (str): The timestamp of the message.

        Returns:
            str: The message source.
        """
        if self.workspace_url:
            channel_id = self.channel_id_map.get(channel_name, "")
            return (
                f"{self.workspace_url}/archives/{channel_id}"
                + f"/p{timestamp.replace('.', '')}"
            )
        else:
            return f"{channel_name} - {user} - {timestamp}"
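

# A minimal usage sketch, not part of the loader itself: the zip path and
# workspace URL below are placeholder values chosen only to illustrate how the
# loader is instantiated and how loaded documents carry their metadata.
if __name__ == "__main__":
    loader = SlackDirectoryLoader(
        zip_path="slack_export.zip",  # placeholder path to a Slack export zip
        workspace_url="https://my-workspace.slack.com",  # assumed workspace URL
    )
    documents = loader.load()
    for doc in documents[:3]:
        # Each Document holds the message text plus channel/user/timestamp metadata;
        # with a workspace_url set, "source" is a link into the Slack archive.
        print(doc.metadata["source"], doc.page_content[:80])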