diff --git a/docs/modules/indexes/document_loaders/examples/trello.ipynb b/docs/modules/indexes/document_loaders/examples/trello.ipynb new file mode 100644 index 00000000..8367f2fa --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/trello.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Trello\n", + "\n", + ">[Trello](https://www.atlassian.com/software/trello) is a web-based project management and collaboration tool that allows individuals and teams to organize and track their tasks and projects. It provides a visual interface known as a \"board\" where users can create lists and cards to represent their tasks and activities.\n", + "\n", + "The TrelloLoader allows you to load cards from a Trello board and is implemented on top of [py-trello](https://pypi.org/project/py-trello/)\n", + "\n", + "This currently supports `api_key/token` only.\n", + "\n", + "1. Credentials generation: https://trello.com/power-ups/admin/\n", + "\n", + "2. 
Click on the manual token generation link to get the token.\n", + "\n", + "To specify the API key and token you can either set the environment variables ``TRELLO_API_KEY`` and ``TRELLO_TOKEN`` or you can pass ``api_key`` and ``token`` directly into the `from_credentials` convenience constructor method.\n", + "\n", + "This loader allows you to provide the board name to pull in the corresponding cards into Document objects.\n", + "\n", + "Notice that the board \"name\" is also called \"title\" in official documentation:\n", + "\n", + "https://support.atlassian.com/trello/docs/changing-a-boards-title-and-description/\n", + "\n", + "You can also specify several load parameters to include / remove different fields both from the document page_content properties and metadata.\n", + "\n", + "## Features\n", + "- Load cards from a Trello board.\n", + "- Filter cards based on their status (open or closed).\n", + "- Include card names, comments, and checklists in the loaded documents.\n", + "- Customize the additional metadata fields to include in the document.\n", + "\n", + "By default all card fields are included for the full text page_content and metadata accordingly.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#!pip install py-trello beautifulsoup4 lxml" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "········\n", + "········\n" + ] + } + ], + "source": [ + "# If you have already set the API key and token using environment variables,\n", + "# you can skip this cell and comment out the `api_key` and `token` named arguments\n", + "# in the initialization steps below.\n", + "from getpass import getpass\n", + "\n", + "API_KEY = getpass()\n", + "TOKEN = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", +
"output_type": "stream", + "text": [ + "Review Tech partner pages\n", + "Comments:\n", + "{'title': 'Review Tech partner pages', 'id': '6475357890dc8d17f73f2dcc', 'url': 'https://trello.com/c/b0OTZwkZ/1-review-tech-partner-pages', 'labels': ['Demand Marketing'], 'list': 'Done', 'closed': False, 'due_date': ''}\n" + ] + } + ], + "source": [ + "from langchain.document_loaders import TrelloLoader\n", + "\n", + "# Get the open cards from \"Awesome Board\"\n", + "loader = TrelloLoader.from_credentials(\n", + " \"Awesome Board\",\n", + " api_key=API_KEY,\n", + " token=TOKEN,\n", + " card_filter=\"open\",\n", + " )\n", + "documents = loader.load()\n", + "\n", + "print(documents[0].page_content)\n", + "print(documents[0].metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Review Tech partner pages\n", + "Comments:\n", + "{'title': 'Review Tech partner pages', 'id': '6475357890dc8d17f73f2dcc', 'url': 'https://trello.com/c/b0OTZwkZ/1-review-tech-partner-pages', 'list': 'Done'}\n" + ] + } + ], + "source": [ + "# Get all the cards from \"Awesome Board\" but only include the\n", + "# card list(column) as extra metadata.\n", + "loader = TrelloLoader.from_credentials(\n", + " \"Awesome Board\",\n", + " api_key=API_KEY,\n", + " token=TOKEN,\n", + " extra_metadata=(\"list\"),\n", + ")\n", + "documents = loader.load()\n", + "\n", + "print(documents[0].page_content)\n", + "print(documents[0].metadata)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Get the cards from \"Another Board\" and exclude the card name,\n", + "# checklist and comments from the Document page_content text.\n", + "loader = TrelloLoader.from_credentials(\n", + " \"test\",\n", + " api_key=API_KEY,\n", + " token=TOKEN,\n", + " include_card_name= False,\n", + " include_checklist= False,\n", + " include_comments= False,\n", + ")\n", 
+ "documents = loader.load()\n", + "\n", + "print(\"Document: \" + documents[0].page_content)\n", + "print(documents[0].metadata)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "vscode": { + "interpreter": { + "hash": "cc99336516f23363341912c6723b01ace86f02e26b4290be1efc0677e2e2ec24" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 3b8a4308..3155fe24 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -92,6 +92,7 @@ from langchain.document_loaders.telegram import ( from langchain.document_loaders.text import TextLoader from langchain.document_loaders.tomarkdown import ToMarkdownLoader from langchain.document_loaders.toml import TomlLoader +from langchain.document_loaders.trello import TrelloLoader from langchain.document_loaders.twitter import TwitterTweetLoader from langchain.document_loaders.unstructured import ( UnstructuredAPIFileIOLoader, @@ -201,6 +202,7 @@ __all__ = [ "StripeLoader", "TextLoader", "TomlLoader", + "TrelloLoader", "TwitterTweetLoader", "UnstructuredAPIFileIOLoader", "UnstructuredAPIFileLoader", diff --git a/langchain/document_loaders/trello.py b/langchain/document_loaders/trello.py new file mode 100644 index 00000000..5c243586 --- /dev/null +++ b/langchain/document_loaders/trello.py @@ -0,0 +1,168 @@ +"""Loader that loads cards from Trello""" +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, List, Literal, Optional, Tuple + +from langchain.docstore.document import Document +from langchain.document_loaders.base import 
class TrelloLoader(BaseLoader):
    """Trello loader. Reads all cards from a Trello board.

    Every card becomes one ``Document``. The card name, description,
    checklists and comments are combined into ``page_content`` (each of the
    name / checklist / comment parts can be switched off), while the card
    title, id and url, plus the fields selected through ``extra_metadata``,
    are stored in ``metadata``.
    """

    def __init__(
        self,
        client: TrelloClient,
        board_name: str,
        *,
        include_card_name: bool = True,
        include_comments: bool = True,
        include_checklist: bool = True,
        card_filter: Literal["closed", "open", "all"] = "all",
        extra_metadata: Tuple[str, ...] = ("due_date", "labels", "list", "closed"),
    ):
        """Initialize Trello loader.

        Args:
            client: Trello API client.
            board_name: The name of the Trello board.
            include_card_name: Whether to include the name of the card in the
                document.
            include_comments: Whether to include the comments on the card in the
                document.
            include_checklist: Whether to include the checklist on the card in the
                document.
            card_filter: Filter on card status. Valid values are "closed", "open",
                "all".
            extra_metadata: Additional metadata fields to include as document
                metadata. Valid values are "due_date", "labels", "list",
                "closed". A single field name given as a plain string is
                accepted and treated as a one-element tuple.

        """
        self.client = client
        self.board_name = board_name
        self.include_card_name = include_card_name
        self.include_comments = include_comments
        self.include_checklist = include_checklist
        # `("list")` is just the string "list", not a 1-tuple. Membership tests
        # on a string are substring tests, so wrap a bare string to make
        # `"list" in self.extra_metadata` an exact field-name match.
        if isinstance(extra_metadata, str):
            extra_metadata = (extra_metadata,)
        self.extra_metadata = tuple(extra_metadata)
        self.card_filter = card_filter

    @classmethod
    def from_credentials(
        cls,
        board_name: str,
        *,
        api_key: Optional[str] = None,
        token: Optional[str] = None,
        **kwargs: Any,
    ) -> TrelloLoader:
        """Convenience constructor that builds TrelloClient init param for you.

        Args:
            board_name: The name of the Trello board.
            api_key: Trello API key. Can also be specified as environment variable
                TRELLO_API_KEY.
            token: Trello token. Can also be specified as environment variable
                TRELLO_TOKEN.
            include_card_name: Whether to include the name of the card in the
                document.
            include_comments: Whether to include the comments on the card in the
                document.
            include_checklist: Whether to include the checklist on the card in the
                document.
            card_filter: Filter on card status. Valid values are "closed", "open",
                "all".
            extra_metadata: Additional metadata fields to include as document
                metadata. Valid values are "due_date", "labels", "list",
                "closed".

        Returns:
            A loader bound to a ``TrelloClient`` created from the credentials.
            All remaining keyword arguments are forwarded to ``__init__``.

        Raises:
            ImportError: If the ``trello`` package (py-trello) is not installed.
        """

        try:
            from trello import TrelloClient  # type: ignore
        except ImportError as ex:
            raise ImportError(
                "Could not import trello python package. "
                "Please install it with `pip install py-trello`."
            ) from ex
        # Explicit arguments win; otherwise fall back to the environment.
        # NOTE(review): presumably get_from_env raises when the variable is
        # missing too - confirm against langchain.utils.
        api_key = api_key or get_from_env("api_key", "TRELLO_API_KEY")
        token = token or get_from_env("token", "TRELLO_TOKEN")
        client = TrelloClient(api_key=api_key, token=token)
        return cls(client, board_name, **kwargs)

    def load(self) -> List[Document]:
        """Loads all cards from the specified Trello board.

        You can filter the cards, metadata and text included by using the optional
        parameters.

        Returns:
            A list of documents, one for each card returned by the board for the
            configured ``card_filter``.

        Raises:
            ImportError: If ``beautifulsoup4`` or ``lxml`` is not installed.
            ValueError: If no board named ``board_name`` exists.
        """
        try:
            from bs4 import BeautifulSoup  # noqa: F401
        except ImportError as ex:
            raise ImportError(
                "`beautifulsoup4` package not found, please run"
                " `pip install beautifulsoup4`"
            ) from ex
        # Card text is parsed with BeautifulSoup's "lxml" parser, so check for
        # it up front instead of failing on the first card.
        try:
            import lxml  # noqa: F401
        except ImportError as ex:
            raise ImportError(
                "`lxml` package not found, please run `pip install lxml`"
            ) from ex

        board = self._get_board()
        # Create a dictionary with the list IDs as keys and the list names as values
        list_dict = {list_item.id: list_item.name for list_item in board.list_lists()}
        # Get Cards on the board
        cards = board.get_cards(card_filter=self.card_filter)
        return [self._card_to_doc(card, list_dict) for card in cards]

    def _get_board(self) -> Board:
        """Return the first board whose name equals ``board_name``.

        Raises:
            ValueError: If the client lists no board with that name.
        """
        # Find the first board with a matching name
        board = next(
            (b for b in self.client.list_boards() if b.name == self.board_name), None
        )
        # Compare against the sentinel explicitly rather than relying on the
        # truthiness of a third-party object.
        if board is None:
            raise ValueError(f"Board `{self.board_name}` not found.")
        return board

    def _card_to_doc(self, card: Card, list_dict: dict) -> Document:
        """Convert one Trello card into a ``Document``.

        The enabled text sections (card name, description, one section per
        non-empty checklist, comments) are joined with a single newline, so a
        section never runs into the previous one. Description and comment text
        is passed through BeautifulSoup to strip markup.

        Args:
            card: The Trello card to convert.
            list_dict: Mapping of Trello list id to list name, used to resolve
                the card's ``list_id`` for the "list" metadata field.

        Returns:
            The document for the card.
        """
        from bs4 import BeautifulSoup  # type: ignore

        sections: List[str] = []
        if self.include_card_name:
            sections.append(card.name)

        # Tolerate a missing description instead of failing on `.strip()`.
        description = card.description or ""
        if description.strip():
            sections.append(BeautifulSoup(description, "lxml").get_text())

        if self.include_checklist:
            # Get all the checklist items on the card
            for checklist in card.checklists:
                if checklist.items:
                    items = [
                        f"{item['name']}:{item['state']}" for item in checklist.items
                    ]
                    sections.append(f"{checklist.name}\n" + "\n".join(items))

        if self.include_comments:
            # Get all the comments on the card
            comments = [
                BeautifulSoup(comment["data"]["text"], "lxml").get_text()
                for comment in card.comments
            ]
            # The header gets its own line and each comment follows on a
            # separate line. The header is kept even when there are no comments.
            sections.append("\n".join(["Comments:", *comments]))

        text_content = "\n".join(sections)

        # Default metadata fields
        metadata = {
            "title": card.name,
            "id": card.id,
            "url": card.url,
        }

        # Extra metadata fields. Card object is not subscriptable.
        if "labels" in self.extra_metadata:
            metadata["labels"] = [label.name for label in card.labels]
        if "list" in self.extra_metadata:
            # Only set when the card's list is known to the board.
            if card.list_id in list_dict:
                metadata["list"] = list_dict[card.list_id]
        if "closed" in self.extra_metadata:
            metadata["closed"] = card.closed
        if "due_date" in self.extra_metadata:
            metadata["due_date"] = card.due_date

        return Document(page_content=text_content, metadata=metadata)
"openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] qdrant = ["qdrant-client"] @@ -10912,4 +10929,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "5e83a1f4ca8c0d3107363e393485174fd72ce9db93db5dc7c21b2dd37b184e66" +content-hash = "1033e47cdab7d3a15fb9322bad64609f77fd3befc47c1a01dc91b22cbbc708a3" diff --git a/pyproject.toml b/pyproject.toml index c61b4524..7b3cb809 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -97,6 +97,7 @@ scikit-learn = {version = "^1.2.2", optional = true} azure-ai-formrecognizer = {version = "^3.2.1", optional = true} azure-ai-vision = {version = "^0.11.1b1", optional = true} azure-cognitiveservices-speech = {version = "^1.28.0", optional = true} +py-trello = {version = "^0.19.0", optional = true} momento = {version = "^1.5.0", optional = true} bibtexparser = {version = "^1.4.0", optional = true} @@ -298,6 +299,7 @@ extended_testing = [ "gql", "requests_toolbelt", "html2text", + "py-trello", "scikit-learn", ] diff --git a/tests/unit_tests/document_loaders/test_trello.py b/tests/unit_tests/document_loaders/test_trello.py new file mode 100644 index 00000000..8ef0a118 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_trello.py @@ -0,0 +1,341 @@ +import unittest +from collections import namedtuple +from typing import Any, Optional +from unittest.mock import patch + +import pytest + +from langchain.document_loaders.trello import TrelloLoader + + +def list_to_objects(dict_list: list) -> list: + """Helper to convert dict objects.""" + return [ + namedtuple("Object", d.keys())(**d) for d in dict_list if isinstance(d, dict) + ] + + +def card_list_to_objects(cards: list) -> list: + """Helper to convert dict cards into trello weird mix of objects and dictionaries""" + for card in cards: + card["checklists"] = list_to_objects(card.get("checklists")) + card["labels"] = list_to_objects(card.get("labels")) + return list_to_objects(cards) + + +class MockBoard: + """ + 
Defining Trello mock board internal object to use in the patched method. + """ + + def __init__(self, id: str, name: str, cards: list, lists: list): + self.id = id + self.name = name + self.cards = cards + self.lists = lists + + def get_cards(self, card_filter: Optional[str] = "") -> list: + """We do not need to test the card-filter since is on Trello Client side.""" + return self.cards + + def list_lists(self) -> list: + return self.lists + + +TRELLO_LISTS = [ + { + "id": "5555cacbc4daa90564b34cf2", + "name": "Publishing Considerations", + }, + { + "id": "5555059b74c03b3a9e362cd0", + "name": "Backlog", + }, + { + "id": "555505a3427fd688c1ca5ebd", + "name": "Selected for Milestone", + }, + { + "id": "555505ba95ff925f9fb1b370", + "name": "Blocked", + }, + { + "id": "555505a695ff925f9fb1b13d", + "name": "In Progress", + }, + { + "id": "555505bdfe380c7edc8ca1a3", + "name": "Done", + }, +] +# Create a mock list of cards. +TRELLO_CARDS_QA = [ + { + "id": "12350aca6952888df7975903", + "name": "Closed Card Title", + "description": "This is the description of Closed Card.", + "closed": True, + "labels": [], + "due_date": "", + "url": "https://trello.com/card/12350aca6952888df7975903", + "list_id": "555505bdfe380c7edc8ca1a3", + "checklists": [ + { + "name": "Checklist 1", + "items": [ + { + "name": "Item 1", + "state": "pending", + }, + { + "name": "Item 2", + "state": "completed", + }, + ], + }, + ], + "comments": [ + { + "data": { + "text": "This is a comment on a Closed Card.", + }, + }, + ], + }, + { + "id": "45650aca6952888df7975903", + "name": "Card 2", + "description": "This is the description of Card 2.", + "closed": False, + "labels": [{"name": "Medium"}, {"name": "Task"}], + "due_date": "", + "url": "https://trello.com/card/45650aca6952888df7975903", + "list_id": "555505a695ff925f9fb1b13d", + "checklists": [], + "comments": [], + }, + { + "id": "55550aca6952888df7975903", + "name": "Camera", + "description": "
", + "closed": False, + "labels": [{"name": "Task"}], + "due_date": "", + "url": "https://trello.com/card/55550aca6952888df7975903", + "list_id": "555505a3427fd688c1ca5ebd", + "checklists": [ + { + "name": "Tasks", + "items": [ + {"name": "Zoom", "state": "complete"}, + {"name": "Follow players", "state": "complete"}, + { + "name": "camera limit to stage size", + "state": "complete", + }, + {"name": "Post Processing effects", "state": "complete"}, + { + "name": "Shitch to universal render pipeline", + "state": "complete", + }, + ], + }, + ], + "comments": [ + { + "data": { + "text": ( + "to follow group of players use Group Camera feature of " + "cinemachine." + ) + } + }, + { + "data": { + "text": "Use 'Impulse' Cinemachine feature for camera shake." + } + }, + {"data": {"text": "depth of field with custom shader."}}, + ], + }, +] + + +@pytest.fixture +def mock_trello_client() -> Any: + """Fixture that creates a mock for trello.TrelloClient.""" + # Create a mock `trello.TrelloClient` object. + with patch("trello.TrelloClient") as mock_trello_client: + # Create a mock list of trello list (columns in the UI). + + # The trello client returns a hierarchy mix of objects and dictionaries. + list_objs = list_to_objects(TRELLO_LISTS) + cards_qa_objs = card_list_to_objects(TRELLO_CARDS_QA) + boards = [ + MockBoard("5555eaafea917522902a2a2c", "Research", [], list_objs), + MockBoard("55559f6002dd973ad8cdbfb7", "QA", cards_qa_objs, list_objs), + ] + + # Patch `get_boards()` method of the mock `TrelloClient` object to return the + # mock list of boards. + mock_trello_client.return_value.list_boards.return_value = boards + yield mock_trello_client.return_value + + +@pytest.mark.usefixtures("mock_trello_client") +@pytest.mark.requires("trello", "bs4", "lxml") +class TestTrelloLoader(unittest.TestCase): + def test_empty_board(self) -> None: + """ + Test loading a board with no cards. 
+ """ + trello_loader = TrelloLoader.from_credentials( + "Research", + api_key="API_KEY", + token="API_TOKEN", + ) + documents = trello_loader.load() + self.assertEqual(len(documents), 0, "Empty board returns an empty list.") + + def test_complete_text_and_metadata(self) -> None: + """ + Test loading a board cards with all metadata. + """ + from bs4 import BeautifulSoup + + trello_loader = TrelloLoader.from_credentials( + "QA", + api_key="API_KEY", + token="API_TOKEN", + ) + documents = trello_loader.load() + self.assertEqual(len(documents), len(TRELLO_CARDS_QA), "Card count matches.") + + soup = BeautifulSoup(documents[0].page_content, "html.parser") + self.assertTrue( + len(soup.find_all()) == 0, + "There is not markup in Closed Card document content.", + ) + + # Check samples of every field type is present in page content. + texts = [ + "Closed Card Title", + "This is the description of Closed Card.", + "Checklist 1", + "Item 1:pending", + "This is a comment on a Closed Card.", + ] + for text in texts: + self.assertTrue(text in documents[0].page_content) + + # Check all metadata is present in first Card + self.assertEqual( + documents[0].metadata, + { + "title": "Closed Card Title", + "id": "12350aca6952888df7975903", + "url": "https://trello.com/card/12350aca6952888df7975903", + "labels": [], + "list": "Done", + "closed": True, + "due_date": "", + }, + "Metadata of Closed Card Matches.", + ) + + soup = BeautifulSoup(documents[1].page_content, "html.parser") + self.assertTrue( + len(soup.find_all()) == 0, + "There is not markup in Card 2 document content.", + ) + + # Check samples of every field type is present in page content. 
+ texts = [ + "Card 2", + "This is the description of Card 2.", + ] + for text in texts: + self.assertTrue(text in documents[1].page_content) + + # Check all metadata is present in second Card + self.assertEqual( + documents[1].metadata, + { + "title": "Card 2", + "id": "45650aca6952888df7975903", + "url": "https://trello.com/card/45650aca6952888df7975903", + "labels": ["Medium", "Task"], + "list": "In Progress", + "closed": False, + "due_date": "", + }, + "Metadata of Card 2 Matches.", + ) + + soup = BeautifulSoup(documents[2].page_content, "html.parser") + self.assertTrue( + len(soup.find_all()) == 0, + "There is not markup in Card 2 document content.", + ) + + # Check samples of every field type is present in page content. + texts = [ + "Camera", + "camera limit to stage size:complete", + "Use 'Impulse' Cinemachine feature for camera shake.", + ] + + for text in texts: + self.assertTrue(text in documents[2].page_content, text + " is present.") + + # Check all metadata is present in second Card + self.assertEqual( + documents[2].metadata, + { + "title": "Camera", + "id": "55550aca6952888df7975903", + "url": "https://trello.com/card/55550aca6952888df7975903", + "labels": ["Task"], + "list": "Selected for Milestone", + "closed": False, + "due_date": "", + }, + "Metadata of Camera Card matches.", + ) + + def test_partial_text_and_metadata(self) -> None: + """ + Test loading a board cards removing some text and metadata. + """ + trello_loader = TrelloLoader.from_credentials( + "QA", + api_key="API_KEY", + token="API_TOKEN", + extra_metadata=("list"), + include_card_name=False, + include_checklist=False, + include_comments=False, + ) + documents = trello_loader.load() + + # Check samples of every field type is present in page content. 
+ texts = [ + "Closed Card Title", + "Checklist 1", + "Item 1:pending", + "This is a comment on a Closed Card.", + ] + for text in texts: + self.assertFalse(text in documents[0].page_content) + + # Check all metadata is present in first Card + self.assertEqual( + documents[0].metadata, + { + "title": "Closed Card Title", + "id": "12350aca6952888df7975903", + "url": "https://trello.com/card/12350aca6952888df7975903", + "list": "Done", + }, + "Metadata of Closed Card Matches.", + )