Harrison/confluent loader (#2994)

Co-authored-by: Justin Flick <Justinjayflick@gmail.com>
fix_agent_callbacks
Harrison Chase 1 year ago committed by GitHub
parent 95d578d246
commit afd3e70ae5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,57 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Confluence\n",
"\n",
"A loader for Confluence pages. Port of https://llamahub.ai/l/confluence\n",
"\n",
"This currently supports both username/api_key and Oauth2 login.\n",
"\n",
"Specify a list page_ids and/or space_key to load in the corresponding pages into Document objects, if both are specified the union of both sets will be returned.\n",
"\n",
"You can also specify a boolean `include_attachments` to include attachments, this is set to False by default, if set to True all attachments will be downloaded and ConfluenceReader will extract the text from the attachments and add it to the Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG, SVG, Word and Excel.\n",
"\n",
"Hint: space_key and page_id can both be found in the URL of a page in Confluence - https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import ConfluenceLoader\n",
"\n",
"loader = ConfluenceLoader(\n",
" url=\"https://yoursite.atlassian.com/wiki\",\n",
" username=\"me\",\n",
" api_key=\"12345\"\n",
")\n",
"documents = loader.load(space_key=\"SPACE\", include_attachments=True, limit=50)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.9.7 (default, Mar 5 2023, 20:59:52) \n[Clang 12.0.0 (clang-1200.0.32.2)]"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "cc99336516f23363341912c6723b01ace86f02e26b4290be1efc0677e2e2ec24"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,436 @@
"""Load Data from a Confluence Space"""
from typing import Any, Callable, List, Optional, Union
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
class ConfluenceLoader(BaseLoader):
"""
Load Confluence pages. Port of https://llamahub.ai/l/confluence
This currently supports both username/api_key and Oauth2 login.
Specify a list page_ids and/or space_key to load in the corresponding pages into
Document objects, if both are specified the union of both sets will be returned.
You can also specify a boolean `include_attachments` to include attachments, this
is set to False by default, if set to True all attachments will be downloaded and
ConfluenceReader will extract the text from the attachments and add it to the
Document object. Currently supported attachment types are: PDF, PNG, JPEG/JPG,
SVG, Word and Excel.
Hint: space_key and page_id can both be found in the URL of a page in Confluence
- https://yoursite.atlassian.com/wiki/spaces/<space_key>/pages/<page_id>
Example:
.. code-block:: python
from langchain.document_loaders import ConfluenceLoader
loader = ConfluenceLoader(
url="https://yoursite.atlassian.com/wiki",
username="me",
api_key="12345"
)
documents = loader.load(space_key="SPACE",limit=50)
:param url: _description_
:type url: str
:param api_key: _description_, defaults to None
:type api_key: str, optional
:param username: _description_, defaults to None
:type username: str, optional
:param oauth2: _description_, defaults to {}
:type oauth2: dict, optional
:param cloud: _description_, defaults to True
:type cloud: bool, optional
:raises ValueError: _description_
:raises ImportError: _description_
"""
def __init__(
self,
url: str,
api_key: Optional[str] = None,
username: Optional[str] = None,
oauth2: Optional[dict] = None,
cloud: bool = True,
):
errors = ConfluenceLoader.validate_init_args(url, api_key, username, oauth2)
if errors:
raise ValueError(f"Error(s) while validating input: {errors}")
self.base_url = url
try:
from atlassian import Confluence # noqa: F401
except ImportError:
raise ImportError(
"`atlassian` package not found, please run"
"`pip install atlassian-python-api`"
)
if oauth2:
self.confluence = Confluence(url=url, oauth2=oauth2, cloud=cloud)
else:
self.confluence = Confluence(
url=url, username=username, password=api_key, cloud=cloud
)
@staticmethod
def validate_init_args(
url: Optional[str] = None,
api_key: Optional[str] = None,
username: Optional[str] = None,
oauth2: Optional[dict] = None,
) -> Union[List, None]:
"""Validates proper combinations of init arguments"""
errors = []
if url is None:
errors.append("Must provide `base_url`")
if (api_key and not username) or (username and not api_key):
errors.append(
"If one of `api_key` or `username` is provided,"
"the other must be as well."
)
if (api_key or username) and oauth2:
errors.append(
"Cannot provide a value for `api_key` and/or"
"`username` and provide a value for `oauth2`"
)
if oauth2 and oauth2.keys() != [
"access_token",
"access_token_secret",
"consumer_key",
"key_cert",
]:
errors.append(
"You have either ommited require keys or added extra"
"keys to the oauth2 dictionary. key values should be"
"`['access_token', 'access_token_secret', 'consumer_key', 'key_cert']`"
)
if errors:
return errors
return None
def load(
self,
space_key: Optional[str] = None,
page_ids: Optional[List[str]] = None,
label: Optional[str] = None,
cql: Optional[str] = None,
include_attachments: bool = False,
limit: Optional[int] = 50,
) -> List[Document]:
"""
:param space_key: Space key retrieved from a confluence URL, defaults to None
:type space_key: Optional[str], optional
:param page_ids: List of specific page IDs to load, defaults to None
:type page_ids: Optional[List[str]], optional
:param label: Get all pages with this label, defaults to None
:type label: Optional[str], optional
:param cql: CQL Expression, defaults to None
:type cql: Optional[str], optional
:param include_attachments: defaults to False
:type include_attachments: bool, optional
:param limit: Maximum number of pages to retrieve, defaults to 50
:type limit: int, optional
:raises ValueError: _description_
:raises ImportError: _description_
:return: _description_
:rtype: List[Document]
"""
if not space_key and not page_ids and not label and not cql:
raise ValueError(
"Must specify at least one among `space_key`, `page_ids`,"
"`label`, `cql` parameters."
)
try:
import html2text # type: ignore
except ImportError:
raise ImportError(
"`html2text` package not found, please run `pip install html2text`"
)
docs = []
text_maker = html2text.HTML2Text()
text_maker.ignore_links = True
text_maker.ignore_images = True
if space_key:
pages = self.paginate_request(
self.confluence.get_all_pages_from_space,
space=space_key,
limit=limit,
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, text_maker)
docs.append(doc)
if label:
pages = self.paginate_request(
self.confluence.get_all_pages_by_label,
label=label,
limit=limit,
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, text_maker)
docs.append(doc)
if cql:
pages = self.paginate_request(
self.confluence.cql, cql=cql, limit=limit, expand="body.storage.value"
)
for page in pages:
doc = self.process_page(page, include_attachments, text_maker)
docs.append(doc)
if page_ids:
for page_id in page_ids:
page = self.confluence.get_page_by_id(
page_id=page_id, expand="body.storage.value"
)
doc = self.process_page(page, include_attachments, text_maker)
docs.append(doc)
return docs
def paginate_request(self, retrieval_method: Callable, **kwargs: Any) -> List:
"""Paginate the various methods to retrieve groups of pages.
Unforunately, due to page size, sometimes the Confluence API
doesn't match the limit value. Also, due to the Atlassian Python
package, we don't get the "next" values from the "_links" key because
they only return the value from the results key. So here, the pagination
starts from 0 and goes until the limit. We have to manually check if there
are more docs based on the length of the returned list of pages, rather than
just checking for the presence of a `next` key in the response like this page
would have you do:
https://developer.atlassian.com/server/confluence/pagination-in-the-rest-api/
:param retrieval_method: Function used to retrieve docs
:type retrieval_method: callable
:return: List of documents
:rtype: List
"""
limit = kwargs["limit"]
page = 0
docs = []
while page < limit:
batch = retrieval_method(**kwargs, start=page)
if len(batch) < limit:
page = limit
else:
page += len(batch)
docs.extend(batch)
return docs
def process_page(
self, page: dict, include_attachments: bool, text_maker: Any
) -> Document:
if include_attachments:
attachment_texts = self.process_attachment(page["id"])
else:
attachment_texts = []
text = text_maker.handle(page["body"]["storage"]["value"]) + "".join(
attachment_texts
)
return Document(
page_content=text, metadata={"title": page["title"], "id": page["id"]}
)
def process_attachment(self, page_id: str) -> List[str]:
try:
import requests # noqa: F401
from PIL import Image # noqa: F401
except ImportError:
raise ImportError(
"`pytesseract` or `pdf2image` or `Pillow` package not found,"
"please run `pip install pytesseract pdf2image Pillow`"
)
# depending on setup you may also need to set the correct path for
# poppler and tesseract
attachments = self.confluence.get_attachments_from_content(page_id)["results"]
texts = []
for attachment in attachments:
media_type = attachment["metadata"]["mediaType"]
absolute_url = self.base_url + attachment["_links"]["download"]
title = attachment["title"]
if media_type == "application/pdf":
text = title + self.process_pdf(absolute_url)
elif (
media_type == "image/png"
or media_type == "image/jpg"
or media_type == "image/jpeg"
):
text = title + self.process_image(absolute_url)
elif (
media_type == "application/vnd.openxmlformats-officedocument"
".wordprocessingml.document"
):
text = title + self.process_doc(absolute_url)
elif media_type == "application/vnd.ms-excel":
text = title + self.process_xls(absolute_url)
elif media_type == "image/svg+xml":
text = title + self.process_svg(absolute_url)
else:
continue
texts.append(text)
return texts
def process_pdf(self, link: str) -> str:
try:
import pytesseract # noqa: F401
from pdf2image import convert_from_bytes # noqa: F401
except ImportError:
raise ImportError(
"`pytesseract` or `pdf2image` package not found,"
"please run `pip install pytesseract pdf2image`"
)
import pytesseract # noqa: F811
from pdf2image import convert_from_bytes # noqa: F811
response = self.confluence.request(path=link, absolute=True)
text = ""
if (
response.status_code != 200
or response.content == b""
or response.content is None
):
return text
try:
images = convert_from_bytes(response.content)
except ValueError:
return text
for i, image in enumerate(images):
image_text = pytesseract.image_to_string(image)
text += f"Page {i + 1}:\n{image_text}\n\n"
return text
def process_image(self, link: str) -> str:
try:
from io import BytesIO # noqa: F401
import pytesseract # noqa: F401
from PIL import Image # noqa: F401
except ImportError:
raise ImportError(
"`pytesseract` or `Pillow` package not found,"
"please run `pip install pytesseract Pillow`"
)
response = self.confluence.request(path=link, absolute=True)
text = ""
if (
response.status_code != 200
or response.content == b""
or response.content is None
):
return text
try:
image = Image.open(BytesIO(response.content))
except OSError:
return text
return pytesseract.image_to_string(image)
def process_doc(self, link: str) -> str:
try:
from io import BytesIO # noqa: F401
import docx2txt # noqa: F401
except ImportError:
raise ImportError(
"`docx2txt` package not found, please run `pip install docx2txt`"
)
response = self.confluence.request(path=link, absolute=True)
text = ""
if (
response.status_code != 200
or response.content == b""
or response.content is None
):
return text
file_data = BytesIO(response.content)
return docx2txt.process(file_data)
def process_xls(self, link: str) -> str:
try:
import xlrd # noqa: F401
except ImportError:
raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
response = self.confluence.request(path=link, absolute=True)
text = ""
if (
response.status_code != 200
or response.content == b""
or response.content is None
):
return text
workbook = xlrd.open_workbook(file_contents=response.content)
for sheet in workbook.sheets():
text += f"{sheet.name}:\n"
for row in range(sheet.nrows):
for col in range(sheet.ncols):
text += f"{sheet.cell_value(row, col)}\t"
text += "\n"
text += "\n"
return text
def process_svg(self, link: str) -> str:
try:
from io import BytesIO # noqa: F401
import pytesseract # noqa: F401
from PIL import Image # noqa: F401
from reportlab.graphics import renderPM # noqa: F401
from reportlab.graphics.shapes import Drawing # noqa: F401
from svglib.svglib import svg2rlg # noqa: F401
except ImportError:
raise ImportError(
"`pytesseract`, `Pillow`, or `svglib` package not found,"
"please run `pip install pytesseract Pillow svglib`"
)
response = self.confluence.request(path=link, absolute=True)
text = ""
if (
response.status_code != 200
or response.content == b""
or response.content is None
):
return text
drawing = svg2rlg(BytesIO(response.content))
img_data = BytesIO()
renderPM.drawToFile(drawing, img_data, fmt="PNG")
img_data.seek(0)
image = Image.open(img_data)
return pytesseract.image_to_string(image)

59
poetry.lock generated

@ -1,4 +1,4 @@
# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand.
# This file is automatically @generated by Poetry and should not be changed by hand.
[[package]]
name = "absl-py"
@ -475,6 +475,27 @@ files = [
{file = "async_timeout-4.0.2-py3-none-any.whl", hash = "sha256:8ca1e4fcf50d07413d66d1a5e416e42cfdf5851c981d679a09851a6853383b3c"},
]
[[package]]
name = "atlassian-python-api"
version = "3.36.0"
description = "Python Atlassian REST API Wrapper"
category = "main"
optional = true
python-versions = "*"
files = [
{file = "atlassian-python-api-3.36.0.tar.gz", hash = "sha256:b1c1f10232818ee3f7e5f59417589971d6f538d12aa79a9784dea09263cf7322"},
]
[package.dependencies]
deprecated = "*"
oauthlib = "*"
requests = "*"
requests_oauthlib = "*"
six = "*"
[package.extras]
kerberos = ["requests-kerberos"]
[[package]]
name = "attrs"
version = "22.2.0"
@ -2400,6 +2421,18 @@ files = [
{file = "hpack-4.0.0.tar.gz", hash = "sha256:fc41de0c63e687ebffde81187a948221294896f6bdc0ae2312708df339430095"},
]
[[package]]
name = "html2text"
version = "2020.1.16"
description = "Turn HTML into equivalent Markdown-structured text."
category = "main"
optional = true
python-versions = ">=3.5"
files = [
{file = "html2text-2020.1.16-py3-none-any.whl", hash = "sha256:c7c629882da0cf377d66f073329ccf34a12ed2adf0169b9285ae4e63ef54c82b"},
{file = "html2text-2020.1.16.tar.gz", hash = "sha256:e296318e16b059ddb97f7a8a1d6a5c1d7af4544049a01e261731d2d5cc277bbb"},
]
[[package]]
name = "httpcore"
version = "0.17.0"
@ -5906,6 +5939,22 @@ files = [
{file = "PySocks-1.7.1.tar.gz", hash = "sha256:3f8804571ebe159c380ac6de37643bb4685970655d3bba243530d6558b799aa0"},
]
[[package]]
name = "pytesseract"
version = "0.3.10"
description = "Python-tesseract is a python wrapper for Google's Tesseract-OCR"
category = "main"
optional = true
python-versions = ">=3.7"
files = [
{file = "pytesseract-0.3.10-py3-none-any.whl", hash = "sha256:8f22cc98f765bf13517ead0c70effedb46c153540d25783e04014f28b55a5fc6"},
{file = "pytesseract-0.3.10.tar.gz", hash = "sha256:f1c3a8b0f07fd01a1085d451f5b8315be6eec1d5577a6796d46dc7a62bd4120f"},
]
[package.dependencies]
packaging = ">=21.3"
Pillow = ">=8.0.0"
[[package]]
name = "pytest"
version = "7.3.1"
@ -7301,7 +7350,7 @@ files = [
]
[package.dependencies]
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""}
greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"}
[package.extras]
aiomysql = ["aiomysql", "greenlet (!=0.4.17)"]
@ -9069,13 +9118,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["aleph-alpha-client", "anthropic", "beautifulsoup4", "cohere", "deeplake", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm"]
cohere = ["cohere"]
llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"]
llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"]
openai = ["openai"]
qdrant = ["qdrant-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "47ad0cfafaf5ec6f27bd1713ac237077cd54083960e937ebd005a7c4b25bbe5e"
content-hash = "f563ddd77272f04b687d9f7e3b97bd39513b38584e271ba5315b85191360cf7a"

@ -60,6 +60,9 @@ psycopg2-binary = {version = "^2.9.5", optional = true}
pyowm = {version = "^3.3.0", optional = true}
async-timeout = {version = "^4.0.0", python = "<3.11"}
gptcache = {version = ">=0.1.7", optional = true}
atlassian-python-api = {version = "^3.36.0", optional=true}
pytesseract = {version = "^0.3.10", optional=true}
html2text = {version="^2020.1.16", optional=true}
numexpr = "^2.8.4"
[tool.poetry.group.docs.dependencies]

@ -0,0 +1,39 @@
import pytest
from langchain.document_loaders.confluence import ConfluenceLoader
try:
from atlassian import Confluence # noqa: F401
confluence_installed = True
except ImportError:
confluence_installed = False
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_load_single_confluence_page() -> None:
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
docs = loader.load(page_ids=["33189"])
assert len(docs) == 1
assert docs[0].page_content is not None
assert docs[0].metadata["id"] == "33189"
assert docs[0].metadata["title"] == "An easy intro to using Confluence"
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_load_full_confluence_space() -> None:
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
docs = loader.load(space_key="RD")
assert len(docs) == 14
assert docs[0].page_content is not None
@pytest.mark.skipif(not confluence_installed, reason="Atlassian package not installed")
def test_confluence_pagination() -> None:
loader = ConfluenceLoader(url="https://templates.atlassian.net/wiki/")
docs = loader.load(space_key="RD", limit=5)
assert len(docs) == 5
assert docs[0].page_content is not None
Loading…
Cancel
Save