|
|
@ -156,6 +156,7 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
page_ids: Optional[List[str]] = None,
|
|
|
|
page_ids: Optional[List[str]] = None,
|
|
|
|
label: Optional[str] = None,
|
|
|
|
label: Optional[str] = None,
|
|
|
|
cql: Optional[str] = None,
|
|
|
|
cql: Optional[str] = None,
|
|
|
|
|
|
|
|
include_restricted_content: bool = False,
|
|
|
|
include_archived_content: bool = False,
|
|
|
|
include_archived_content: bool = False,
|
|
|
|
include_attachments: bool = False,
|
|
|
|
include_attachments: bool = False,
|
|
|
|
include_comments: bool = False,
|
|
|
|
include_comments: bool = False,
|
|
|
@ -171,6 +172,8 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
:type label: Optional[str], optional
|
|
|
|
:type label: Optional[str], optional
|
|
|
|
:param cql: CQL Expression, defaults to None
|
|
|
|
:param cql: CQL Expression, defaults to None
|
|
|
|
:type cql: Optional[str], optional
|
|
|
|
:type cql: Optional[str], optional
|
|
|
|
|
|
|
|
:param include_restricted_content: defaults to False
|
|
|
|
|
|
|
|
:type include_restricted_content: bool, optional
|
|
|
|
:param include_archived_content: Whether to include archived content,
|
|
|
|
:param include_archived_content: Whether to include archived content,
|
|
|
|
defaults to False
|
|
|
|
defaults to False
|
|
|
|
:type include_archived_content: bool, optional
|
|
|
|
:type include_archived_content: bool, optional
|
|
|
@ -204,9 +207,9 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
status="any" if include_archived_content else "current",
|
|
|
|
status="any" if include_archived_content else "current",
|
|
|
|
expand="body.storage.value",
|
|
|
|
expand="body.storage.value",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
for page in pages:
|
|
|
|
docs += self.process_pages(
|
|
|
|
doc = self.process_page(page, include_attachments, include_comments)
|
|
|
|
pages, include_restricted_content, include_attachments, include_comments
|
|
|
|
docs.append(doc)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if label:
|
|
|
|
if label:
|
|
|
|
pages = self.paginate_request(
|
|
|
|
pages = self.paginate_request(
|
|
|
@ -216,9 +219,9 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
max_pages=max_pages,
|
|
|
|
max_pages=max_pages,
|
|
|
|
expand="body.storage.value",
|
|
|
|
expand="body.storage.value",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
for page in pages:
|
|
|
|
docs += self.process_pages(
|
|
|
|
doc = self.process_page(page, include_attachments, include_comments)
|
|
|
|
pages, include_restricted_content, include_attachments, include_comments
|
|
|
|
docs.append(doc)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if cql:
|
|
|
|
if cql:
|
|
|
|
pages = self.paginate_request(
|
|
|
|
pages = self.paginate_request(
|
|
|
@ -229,9 +232,9 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
include_archived_spaces=include_archived_content,
|
|
|
|
include_archived_spaces=include_archived_content,
|
|
|
|
expand="body.storage.value",
|
|
|
|
expand="body.storage.value",
|
|
|
|
)
|
|
|
|
)
|
|
|
|
for page in pages:
|
|
|
|
docs += self.process_pages(
|
|
|
|
doc = self.process_page(page, include_attachments, include_comments)
|
|
|
|
pages, include_restricted_content, include_attachments, include_comments
|
|
|
|
docs.append(doc)
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
if page_ids:
|
|
|
|
if page_ids:
|
|
|
|
for page_id in page_ids:
|
|
|
|
for page_id in page_ids:
|
|
|
@ -248,6 +251,8 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
|
|
before_sleep=before_sleep_log(logger, logging.WARNING),
|
|
|
|
)(self.confluence.get_page_by_id)
|
|
|
|
)(self.confluence.get_page_by_id)
|
|
|
|
page = get_page(page_id=page_id, expand="body.storage.value")
|
|
|
|
page = get_page(page_id=page_id, expand="body.storage.value")
|
|
|
|
|
|
|
|
if not include_restricted_content and not self.is_public_page(page):
|
|
|
|
|
|
|
|
continue
|
|
|
|
doc = self.process_page(page, include_attachments, include_comments)
|
|
|
|
doc = self.process_page(page, include_attachments, include_comments)
|
|
|
|
docs.append(doc)
|
|
|
|
docs.append(doc)
|
|
|
|
|
|
|
|
|
|
|
@ -295,6 +300,33 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
docs.extend(batch)
|
|
|
|
docs.extend(batch)
|
|
|
|
return docs[:max_pages]
|
|
|
|
return docs[:max_pages]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def is_public_page(self, page: dict) -> bool:
|
|
|
|
|
|
|
|
"""Check if a page is publicly accessible."""
|
|
|
|
|
|
|
|
restrictions = self.confluence.get_all_restrictions_for_content(page["id"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return (
|
|
|
|
|
|
|
|
page["status"] == "current"
|
|
|
|
|
|
|
|
and not restrictions["read"]["restrictions"]["user"]["results"]
|
|
|
|
|
|
|
|
and not restrictions["read"]["restrictions"]["group"]["results"]
|
|
|
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def process_pages(
|
|
|
|
|
|
|
|
self,
|
|
|
|
|
|
|
|
pages: List[dict],
|
|
|
|
|
|
|
|
include_restricted_content: bool,
|
|
|
|
|
|
|
|
include_attachments: bool,
|
|
|
|
|
|
|
|
include_comments: bool,
|
|
|
|
|
|
|
|
) -> List[Document]:
|
|
|
|
|
|
|
|
"""Process a list of pages into a list of documents."""
|
|
|
|
|
|
|
|
docs = []
|
|
|
|
|
|
|
|
for page in pages:
|
|
|
|
|
|
|
|
if not include_restricted_content and not self.is_public_page(page):
|
|
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
doc = self.process_page(page, include_attachments, include_comments)
|
|
|
|
|
|
|
|
docs.append(doc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
return docs
|
|
|
|
|
|
|
|
|
|
|
|
def process_page(
|
|
|
|
def process_page(
|
|
|
|
self,
|
|
|
|
self,
|
|
|
|
page: dict,
|
|
|
|
page: dict,
|
|
|
|