From afd3e70ae5bc0dff78e2e3240924b6c82d7f09c0 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Mon, 17 Apr 2023 20:23:45 -0700
Subject: [PATCH] Harrison/confluent loader (#2994)

Co-authored-by: Justin Flick
---
 .../examples/confluence.ipynb                |  57 +++
 langchain/document_loaders/confluence.py     | 436 ++++++++++++++++++
 poetry.lock                                  |  59 ++-
 pyproject.toml                               |   3 +
 .../document_loaders/test_confluence.py      |  39 ++
 5 files changed, 589 insertions(+), 5 deletions(-)
 create mode 100644 docs/modules/indexes/document_loaders/examples/confluence.ipynb
 create mode 100644 langchain/document_loaders/confluence.py
 create mode 100644 tests/integration_tests/document_loaders/test_confluence.py

diff --git a/docs/modules/indexes/document_loaders/examples/confluence.ipynb b/docs/modules/indexes/document_loaders/examples/confluence.ipynb
new file mode 100644
index 00000000..59e9a4f7
--- /dev/null
+++ b/docs/modules/indexes/document_loaders/examples/confluence.ipynb
@@ -0,0 +1,57 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Confluence\n",
+    "\n",
+    "A loader for Confluence pages. Port of https://llamahub.ai/l/confluence\n",
+    "\n",
+    "This currently supports both username/api_key and OAuth2 login.\n",
+    "\n",
+    "Specify a list of page_ids and/or a space_key to load the corresponding pages into Document objects; if both are specified, the union of both sets will be returned.\n",
+    "\n",
+    "You can also specify a boolean `include_attachments` to include attachments. This is set to False by default; if set to True, all attachments will be downloaded and ConfluenceReader will extract the text from the attachments and add it to the Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG, SVG, Word and Excel.\n",
+    "\n",
+    "Hint: space_key and page_id can both be found in the URL of a page in Confluence - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import ConfluenceLoader\n",
+    "\n",
+    "loader = ConfluenceLoader(\n",
+    "    url=\"https://yoursite.atlassian.com/wiki\",\n",
+    "    username=\"me\",\n",
+    "    api_key=\"12345\"\n",
+    ")\n",
+    "documents = loader.load(space_key=\"SPACE\", include_attachments=True, limit=50)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python",
+   "version": "3.9.7 (default, Mar 5 2023, 20:59:52) \n[Clang 12.0.0 (clang-1200.0.32.2)]"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "cc99336516f23363341912c6723b01ace86f02e26b4290be1efc0677e2e2ec24"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
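The notebook above authenticates with username/api_key and loads a whole space. As a
minimal sketch of the page_ids path the same loader supports (the site URL, credentials,
and ids below are placeholders, not real values):

    from langchain.document_loaders import ConfluenceLoader

    loader = ConfluenceLoader(
        url="https://yoursite.atlassian.com/wiki",
        username="me",
        api_key="12345",
    )
    # page_ids and space_key may be combined; the union of both sets is returned.
    docs = loader.load(page_ids=["33189", "33190"], include_attachments=False)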
diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py
new file mode 100644
index 00000000..b8b82180
--- /dev/null
+++ b/langchain/document_loaders/confluence.py
@@ -0,0 +1,436 @@
+"""Load Data from a Confluence Space"""
+from typing import Any, Callable, List, Optional, Union
+
+from langchain.docstore.document import Document
+from langchain.document_loaders.base import BaseLoader
+
+
+class ConfluenceLoader(BaseLoader):
+    """
+    Load Confluence pages. Port of https://llamahub.ai/l/confluence
+    This currently supports both username/api_key and OAuth2 login.
+
+    Specify a list of page_ids and/or a space_key to load the corresponding pages into
+    Document objects; if both are specified, the union of both sets will be returned.
+
+    You can also specify a boolean `include_attachments` to include attachments. This
+    is set to False by default; if set to True, all attachments will be downloaded and
+    ConfluenceReader will extract the text from the attachments and add it to the
+    Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG,
+    SVG, Word and Excel.
+
+    Hint: space_key and page_id can both be found in the URL of a page in Confluence
+    - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
+
+    Example:
+        .. code-block:: python
+
+            from langchain.document_loaders import ConfluenceLoader
+
+            loader = ConfluenceLoader(
+                url="https://yoursite.atlassian.com/wiki",
+                username="me",
+                api_key="12345"
+            )
+            documents = loader.load(space_key="SPACE", limit=50)
+
+    :param url: base URL of the Confluence site,
+        e.g. https://yoursite.atlassian.com/wiki
+    :type url: str
+    :param api_key: Confluence API key, defaults to None
+    :type api_key: str, optional
+    :param username: Confluence username, defaults to None
+    :type username: str, optional
+    :param oauth2: OAuth2 credentials dictionary, defaults to None
+    :type oauth2: dict, optional
+    :param cloud: whether the instance is Confluence Cloud, defaults to True
+    :type cloud: bool, optional
+    :raises ValueError: if an invalid combination of arguments is provided
+    :raises ImportError: if the atlassian-python-api package is not installed
+    """
+
+    def __init__(
+        self,
+        url: str,
+        api_key: Optional[str] = None,
+        username: Optional[str] = None,
+        oauth2: Optional[dict] = None,
+        cloud: bool = True,
+    ):
+        errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
+        if errors:
+            raise ValueError(f"Error(s) while validating input: {errors}")
+
+        self.base_url = url
+
+        try:
+            from atlassian import Confluence  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "`atlassian` package not found, please run "
+                "`pip install atlassian-python-api`"
+            )
+
+        if oauth2:
+            self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
+        else:
+            self.confluence = Confluence(
+                url=url, username=username, password=api_key, cloud=cloud
+            )
+
+    @staticmethod
+    def validate_init_args(
+        url: Optional[str] = None,
+        api_key: Optional[str] = None,
+        username: Optional[str] = None,
+        oauth2: Optional[dict] = None,
+    ) -> Union[List, None]:
+        """Validates proper combinations of init arguments"""
+
+        errors = []
+        if url is None:
+            errors.append("Must provide `url`")
+
+        if (api_key and not username) or (username and not api_key):
+            errors.append(
+                "If one of `api_key` or `username` is provided, "
+                "the other must be as well."
+            )
+
+        if (api_key or username) and oauth2:
+            errors.append(
+                "Cannot provide a value for `api_key` and/or "
+                "`username` and provide a value for `oauth2`"
+            )
+
+        if oauth2 and set(oauth2.keys()) != {
+            "access_token",
+            "access_token_secret",
+            "consumer_key",
+            "key_cert",
+        }:
+            errors.append(
+                "You have either omitted required keys or added extra "
+                "keys to the oauth2 dictionary. Key values should be "
+                "`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
+            )
+
+        if errors:
+            return errors
+        return None
+
+    def load(
+        self,
+        space_key: Optional[str] = None,
+        page_ids: Optional[List[str]] = None,
+        label: Optional[str] = None,
+        cql: Optional[str] = None,
+        include_attachments: bool = False,
+        limit: Optional[int] = 50,
+    ) -> List[Document]:
+        """
+        :param space_key: Space key retrieved from a Confluence URL, defaults to None
+        :type space_key: Optional[str], optional
+        :param page_ids: List of specific page IDs to load, defaults to None
+        :type page_ids: Optional[List[str]], optional
+        :param label: Get all pages with this label, defaults to None
+        :type label: Optional[str], optional
+        :param cql: CQL Expression, defaults to None
+        :type cql: Optional[str], optional
+        :param include_attachments: whether to also download and extract attachment
+            text, defaults to False
+        :type include_attachments: bool, optional
+        :param limit: Maximum number of pages to retrieve, defaults to 50
+        :type limit: Optional[int], optional
+        :raises ValueError: if none of space_key, page_ids, label, or cql is given
+        :raises ImportError: if the html2text package is not installed
+        :return: the loaded Document objects
+        :rtype: List[Document]
+        """
+        if not space_key and not page_ids and not label and not cql:
+            raise ValueError(
+                "Must specify at least one among `space_key`, `page_ids`, "
+                "`label`, `cql` parameters."
+            )
+
+        try:
+            import html2text  # type: ignore
+        except ImportError:
+            raise ImportError(
+                "`html2text` package not found, please run `pip install html2text`"
+            )
+
+        docs = []
+
+        text_maker = html2text.HTML2Text()
+        text_maker.ignore_links = True
+        text_maker.ignore_images = True
+
+        if space_key:
+            pages = self.paginate_request(
+                self.confluence.get_all_pages_from_space,
+                space=space_key,
+                limit=limit,
+                expand="body.storage.value",
+            )
+            for page in pages:
+                doc = self.process_page(page, include_attachments, text_maker)
+                docs.append(doc)
+
+        if label:
+            pages = self.paginate_request(
+                self.confluence.get_all_pages_by_label,
+                label=label,
+                limit=limit,
+                expand="body.storage.value",
+            )
+            for page in pages:
+                doc = self.process_page(page, include_attachments, text_maker)
+                docs.append(doc)
+
+        if cql:
+            pages = self.paginate_request(
+                self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
+            )
+            for page in pages:
+                doc = self.process_page(page, include_attachments, text_maker)
+                docs.append(doc)
+
+        if page_ids:
+            for page_id in page_ids:
+                page = self.confluence.get_page_by_id(
+                    page_id=page_id, expand="body.storage.value"
+                )
+                doc = self.process_page(page, include_attachments, text_maker)
+                docs.append(doc)
+
+        return docs
+
+    def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
+        """Paginate the various methods to retrieve groups of pages.
+
+        Unfortunately, due to page size, sometimes the Confluence API
+        doesn't match the limit value. Also, due to the Atlassian Python
+        package, we don't get the "next" values from the "_links" key because
+        they only return the value from the results key. So here, the pagination
+        starts from 0 and goes until the limit. We have to manually check if there
+        are more docs based on the length of the returned list of pages, rather than
+        just checking for the presence of a `next` key in the response like this page
+        would have you do:
+        https://developer.atlassian.com/server/confluence/pagination-in-the-rest-api/
+
+        :param retrieval_method: Function used to retrieve docs
+        :type retrieval_method: callable
+        :return: List of documents
+        :rtype: List
+        """
+
+        limit = kwargs["limit"]
+        page = 0
+        docs = []
+        while page < limit:
+            batch = retrieval_method(**kwargs, start=page)
+            if len(batch) < limit:
+                page = limit
+            else:
+                page += len(batch)
+            docs.extend(batch)
+        return docs
+
+    def process_page(
+        self, page: dict, include_attachments: bool, text_maker: Any
+    ) -> Document:
+        if include_attachments:
+            attachment_texts = self.process_attachment(page["id"])
+        else:
+            attachment_texts = []
+        text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
+            attachment_texts
+        )
+        return Document(
+            page_content=text, metadata={"title": page["title"], "id": page["id"]}
+        )
+
+    def process_attachment(self, page_id: str) -> List[str]:
+        try:
+            import requests  # noqa: F401
+            from PIL import Image  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "`requests` or `Pillow` package not found, "
+                "please run `pip install requests Pillow`"
+            )
+
+        # depending on setup you may also need to set the correct path for
+        # poppler and tesseract
+        attachments = self.confluence.get_attachments_from_content(page_id)["results"]
+        texts = []
+        for attachment in attachments:
+            media_type = attachment["metadata"]["mediaType"]
+            absolute_url = self.base_url + attachment["_links"]["download"]
+            title = attachment["title"]
+            if media_type == "application/pdf":
+                text = title + self.process_pdf(absolute_url)
+            elif (
+                media_type == "image/png"
+                or media_type == "image/jpg"
+                or media_type == "image/jpeg"
+            ):
+                text = title + self.process_image(absolute_url)
+            elif (
+                media_type == "application/vnd.openxmlformats-officedocument"
+                ".wordprocessingml.document"
+            ):
+                text = title + self.process_doc(absolute_url)
+            elif media_type == "application/vnd.ms-excel":
+                text = title + self.process_xls(absolute_url)
+            elif media_type == "image/svg+xml":
+                text = title + self.process_svg(absolute_url)
+            else:
+                continue
+            texts.append(text)
+
+        return texts
+
+    def process_pdf(self, link: str) -> str:
+        try:
+            import pytesseract
+            from pdf2image import convert_from_bytes
+        except ImportError:
+            raise ImportError(
+                "`pytesseract` or `pdf2image` package not found, "
+                "please run `pip install pytesseract pdf2image`"
+            )
+
+        response = self.confluence.request(path=link, absolute=True)
+        text = ""
+
+        if (
+            response.status_code != 200
+            or response.content == b""
+            or response.content is None
+        ):
+            return text
+        try:
+            images = convert_from_bytes(response.content)
+        except ValueError:
+            return text
+
+        for i, image in enumerate(images):
+            image_text = pytesseract.image_to_string(image)
+            text += f"Page {i + 1}:\n{image_text}\n\n"
+
+        return text
+
+    def process_image(self, link: str) -> str:
+        try:
+            from io import BytesIO  # noqa: F401
+
+            import pytesseract  # noqa: F401
+            from PIL import Image  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "`pytesseract` or `Pillow` package not found, "
+                "please run `pip install pytesseract Pillow`"
+            )
+
+        response = self.confluence.request(path=link, absolute=True)
+        text = ""
+
+        if (
+            response.status_code != 200
+            or response.content == b""
+            or response.content is None
+        ):
+            return text
+        try:
+            image = Image.open(BytesIO(response.content))
+        except OSError:
+            return text
+
+        return pytesseract.image_to_string(image)
+
+    def process_doc(self, link: str) -> str:
+        try:
+            from io import BytesIO  # noqa: F401
+
+            import docx2txt  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "`docx2txt` package not found, please run `pip install docx2txt`"
+            )
+
+        response = self.confluence.request(path=link, absolute=True)
+        text = ""
+
+        if (
+            response.status_code != 200
+            or response.content == b""
+            or response.content is None
+        ):
+            return text
+        file_data = BytesIO(response.content)
+
+        return docx2txt.process(file_data)
+
+    def process_xls(self, link: str) -> str:
+        try:
+            import xlrd  # noqa: F401
+        except ImportError:
+            raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
+
+        response = self.confluence.request(path=link, absolute=True)
+        text = ""
+
+        if (
+            response.status_code != 200
+            or response.content == b""
+            or response.content is None
+        ):
+            return text
+
+        workbook = xlrd.open_workbook(file_contents=response.content)
+        for sheet in workbook.sheets():
+            text += f"{sheet.name}:\n"
+            for row in range(sheet.nrows):
+                for col in range(sheet.ncols):
+                    text += f"{sheet.cell_value(row, col)}\t"
+                text += "\n"
+            text += "\n"
+
+        return text
+
+    def process_svg(self, link: str) -> str:
+        try:
+            from io import BytesIO  # noqa: F401
+
+            import pytesseract  # noqa: F401
+            from PIL import Image  # noqa: F401
+            from reportlab.graphics import renderPM  # noqa: F401
+            from reportlab.graphics.shapes import Drawing  # noqa: F401
+            from svglib.svglib import svg2rlg  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "`pytesseract`, `Pillow`, or `svglib` package not found, "
+                "please run `pip install pytesseract Pillow svglib`"
+            )
+
+        response = self.confluence.request(path=link, absolute=True)
+        text = ""
+
+        if (
+            response.status_code != 200
+            or response.content == b""
+            or response.content is None
+        ):
+            return text
+
+        drawing = svg2rlg(BytesIO(response.content))
+
+        img_data = BytesIO()
+        renderPM.drawToFile(drawing, img_data, fmt="PNG")
+        img_data.seek(0)
+        image = Image.open(img_data)
+
+        return pytesseract.image_to_string(image)
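A note on the OCR-based handlers above: pytesseract shells out to the tesseract
binary and pdf2image needs poppler, so both must be installed on the host in
addition to the pip packages. A minimal setup sketch (the paths below are examples,
not defaults the loader sets):

    import pytesseract
    from pdf2image import convert_from_bytes

    # Point pytesseract at a tesseract binary that is not on PATH (example path).
    pytesseract.pytesseract.tesseract_cmd = "/usr/local/bin/tesseract"

    # pdf2image accepts an explicit poppler location the same way (example path).
    with open("sample.pdf", "rb") as f:
        images = convert_from_bytes(f.read(), poppler_path="/usr/local/opt/poppler/bin")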
diff --git a/poetry.lock b/poetry.lock
index 8b7262a1..fb0c7f1f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
+# This file is automatically @generated by Poetry and should not be changed by hand.
 
 [[package]]
 name = "absl-py"
@@ -475,6 +475,27 @@ files = [
     {file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
 ]
 
+[[package]]
+name = "atlassian-python-api"
+version = "3.36.0"
+description = "Python Atlassian REST API Wrapper"
+category = "main"
+optional = true
+python-versions = "*"
+files = [
+    {file = "atlassian-python-api-3.36.0.tar.gz", hash = "sha256:b1c1f10232818ee3f7e5f59417589971d6f538d12aa79a9784dea09263cf7322"},
+]
+
+[package.dependencies]
+deprecated = "*"
+oauthlib = "*"
+requests = "*"
+requests_oauthlib = "*"
+six = "*"
+
+[package.extras]
+kerberos = ["requests-kerberos"]
+
 [[package]]
 name = "attrs"
 version = "22.2.0"
@@ -2400,6 +2421,18 @@ files = [
     {file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
 ]
 
+[[package]]
+name = "html2text"
+version = "2020.1.16"
+description = "Turn HTML into equivalent Markdown-structured text."
+category = "main"
+optional = true
+python-versions = ">=3.5"
+files = [
+    {file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"},
+    {file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"},
+]
+
 [[package]]
 name = "httpcore"
 version = "0.17.0"
@@ -5906,6 +5939,22 @@ files = [
     {file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
 ]
 
+[[package]]
+name = "pytesseract"
+version = "0.3.10"
+description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
+category = "main"
+optional = true
+python-versions = ">=3.7"
+files = [
+    {file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
+    {file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
+]
+
+[package.dependencies]
+packaging = ">=21.3"
+Pillow = ">=8.0.0"
+
 [[package]]
 name = "pytest"
 version = "7.3.1"
@@ -7301,7 +7350,7 @@ files = [
 ]
 
 [package.dependencies]
-greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
+greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
 
 [package.extras]
 aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@@ -9069,13 +9118,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\""}
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-all = ["aleph-alpha-client", "anthropic", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
+all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm"]
 cohere = ["cohere"]
-llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
+llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
 openai = ["openai"]
 qdrant = ["qdrant-client"]
 
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "47ad0cfafaf5ec6f27bd1713ac237077cd54083960e937ebd005a7c4b25bbe5e"
+content-hash = "f563ddd77272f04b687d9f7e3b97bd39513b38584e271ba5315b85191360cf7a"
diff --git a/pyproject.toml b/pyproject.toml
index 8d376e57..5801640a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -60,6 +60,9 @@ psycopg2-binary = {version = "^2.9.5", optional = true}
 pyowm = {version = "^3.3.0", optional = true}
 async-timeout = {version = "^4.0.0", python = "<3.11"}
 gptcache = {version = ">=0.1.7", optional = true}
+atlassian-python-api = {version = "^3.36.0", optional=true}
+pytesseract = {version = "^0.3.10", optional=true}
+html2text = {version="^2020.1.16", optional=true}
 numexpr = "^2.8.4"
 
 [tool.poetry.group.docs.dependencies]
diff --git a/tests/integration_tests/document_loaders/test_confluence.py b/tests/integration_tests/document_loaders/test_confluence.py
new file mode 100644
index 00000000..211e165f
--- /dev/null
+++ b/tests/integration_tests/document_loaders/test_confluence.py
@@ -0,0 +1,39 @@
+import pytest
+
+from langchain.document_loaders.confluence import ConfluenceLoader
+
+try:
+    from atlassian import Confluence  # noqa: F401
+
+    confluence_installed = True
+except ImportError:
+    confluence_installed = False
+
+
+@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
+def test_load_single_confluence_page() -> None:
+    loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
+    docs = loader.load(page_ids=["33189"])
+
+    assert len(docs) == 1
+    assert docs[0].page_content is not None
+    assert docs[0].metadata["id"] == "33189"
+    assert docs[0].metadata["title"] == "An easy intro to using Confluence"
+
+
+@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
+def test_load_full_confluence_space() -> None:
+    loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
+    docs = loader.load(space_key="RD")
+
+    assert len(docs) == 14
+    assert docs[0].page_content is not None
+
+
+@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
+def test_confluence_pagination() -> None:
+    loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
+    docs = loader.load(space_key="RD", limit=5)
+
+    assert len(docs) == 5
+    assert docs[0].page_content is not None