Harrison/confluence stuff (#3765)

Co-authored-by: Jelmer Borst <japborst@gmail.com>
fix_agent_callbacks
Harrison Chase 1 year ago committed by GitHub
parent 5998b53596
commit 15b92d361d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -156,6 +156,7 @@ class ConfluenceLoader(BaseLoader):
page_ids: Optional[List[str]] = None,
label: Optional[str] = None,
cql: Optional[str] = None,
include_restricted_content: bool = False,
include_archived_content: bool = False,
include_attachments: bool = False,
include_comments: bool = False,
@ -171,6 +172,8 @@ class ConfluenceLoader(BaseLoader):
:type label: Optional[str], optional
:param cql: CQL Expression, defaults to None
:type cql: Optional[str], optional
:param include_restricted_content: defaults to False
:type include_restricted_content: bool, optional
:param include_archived_content: Whether to include archived content,
defaults to False
:type include_archived_content: bool, optional
@ -204,9 +207,9 @@ class ConfluenceLoader(BaseLoader):
status="any" if include_archived_content else "current",
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, include_comments)
docs.append(doc)
docs += self.process_pages(
pages, include_restricted_content, include_attachments, include_comments
)
if label:
pages = self.paginate_request(
@ -216,9 +219,9 @@ class ConfluenceLoader(BaseLoader):
max_pages=max_pages,
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, include_comments)
docs.append(doc)
docs += self.process_pages(
pages, include_restricted_content, include_attachments, include_comments
)
if cql:
pages = self.paginate_request(
@ -229,9 +232,9 @@ class ConfluenceLoader(BaseLoader):
include_archived_spaces=include_archived_content,
expand="body.storage.value",
)
for page in pages:
doc = self.process_page(page, include_attachments, include_comments)
docs.append(doc)
docs += self.process_pages(
pages, include_restricted_content, include_attachments, include_comments
)
if page_ids:
for page_id in page_ids:
@ -248,6 +251,8 @@ class ConfluenceLoader(BaseLoader):
before_sleep=before_sleep_log(logger, logging.WARNING),
)(self.confluence.get_page_by_id)
page = get_page(page_id=page_id, expand="body.storage.value")
if not include_restricted_content and not self.is_public_page(page):
continue
doc = self.process_page(page, include_attachments, include_comments)
docs.append(doc)
@ -295,6 +300,33 @@ class ConfluenceLoader(BaseLoader):
docs.extend(batch)
return docs[:max_pages]
def is_public_page(self, page: dict) -> bool:
"""Check if a page is publicly accessible."""
restrictions = self.confluence.get_all_restrictions_for_content(page["id"])
return (
page["status"] == "current"
and not restrictions["read"]["restrictions"]["user"]["results"]
and not restrictions["read"]["restrictions"]["group"]["results"]
)
def process_pages(
self,
pages: List[dict],
include_restricted_content: bool,
include_attachments: bool,
include_comments: bool,
) -> List[Document]:
"""Process a list of pages into a list of documents."""
docs = []
for page in pages:
if not include_restricted_content and not self.is_public_page(page):
continue
doc = self.process_page(page, include_attachments, include_comments)
docs.append(doc)
return docs
def process_page(
self,
page: dict,

Loading…
Cancel
Save