@ -60,6 +60,8 @@ class ConfluenceLoader(BaseLoader):
: type min_retry_seconds : Optional [ int ] , optional
: type min_retry_seconds : Optional [ int ] , optional
: param max_retry_seconds : defaults to 10
: param max_retry_seconds : defaults to 10
: type max_retry_seconds : Optional [ int ] , optional
: type max_retry_seconds : Optional [ int ] , optional
: param confluence_kwargs : additional kwargs to initialize confluence with
: type confluence_kwargs : dict , optional
: raises ValueError : Errors while validating input
: raises ValueError : Errors while validating input
: raises ImportError : Required dependencies not installed .
: raises ImportError : Required dependencies not installed .
"""
"""
@ -74,7 +76,9 @@ class ConfluenceLoader(BaseLoader):
number_of_retries : Optional [ int ] = 3 ,
number_of_retries : Optional [ int ] = 3 ,
min_retry_seconds : Optional [ int ] = 2 ,
min_retry_seconds : Optional [ int ] = 2 ,
max_retry_seconds : Optional [ int ] = 10 ,
max_retry_seconds : Optional [ int ] = 10 ,
confluence_kwargs : Optional [ dict ] = None ,
) :
) :
confluence_kwargs = confluence_kwargs or { }
errors = ConfluenceLoader . validate_init_args ( url , api_key , username , oauth2 )
errors = ConfluenceLoader . validate_init_args ( url , api_key , username , oauth2 )
if errors :
if errors :
raise ValueError ( f " Error(s) while validating input: { errors } " )
raise ValueError ( f " Error(s) while validating input: { errors } " )
@ -93,10 +97,16 @@ class ConfluenceLoader(BaseLoader):
)
)
if oauth2 :
if oauth2 :
self . confluence = Confluence ( url = url , oauth2 = oauth2 , cloud = cloud )
self . confluence = Confluence (
url = url , oauth2 = oauth2 , cloud = cloud , * * confluence_kwargs
)
else :
else :
self . confluence = Confluence (
self . confluence = Confluence (
url = url , username = username , password = api_key , cloud = cloud
url = url ,
username = username ,
password = api_key ,
cloud = cloud ,
* * confluence_kwargs ,
)
)
@staticmethod
@staticmethod
@ -147,7 +157,9 @@ class ConfluenceLoader(BaseLoader):
label : Optional [ str ] = None ,
label : Optional [ str ] = None ,
cql : Optional [ str ] = None ,
cql : Optional [ str ] = None ,
include_attachments : bool = False ,
include_attachments : bool = False ,
include_comments : bool = False ,
limit : Optional [ int ] = 50 ,
limit : Optional [ int ] = 50 ,
max_pages : Optional [ int ] = 1000 ,
) - > List [ Document ] :
) - > List [ Document ] :
"""
"""
: param space_key : Space key retrieved from a confluence URL , defaults to None
: param space_key : Space key retrieved from a confluence URL , defaults to None
@ -160,8 +172,12 @@ class ConfluenceLoader(BaseLoader):
: type cql : Optional [ str ] , optional
: type cql : Optional [ str ] , optional
: param include_attachments : defaults to False
: param include_attachments : defaults to False
: type include_attachments : bool , optional
: type include_attachments : bool , optional
: param limit : Maximum number of pages to retrieve , defaults to 50
: param include_comments : defaults to False
: type include_comments : bool , optional
: param limit : Maximum number of pages to retrieve per request , defaults to 50
: type limit : int , optional
: type limit : int , optional
: param max_pages : Maximum number of pages to retrieve in total , defaults 1000
: type max_pages : int , optional
: raises ValueError : _description_
: raises ValueError : _description_
: raises ImportError : _description_
: raises ImportError : _description_
: return : _description_
: return : _description_
@ -191,10 +207,13 @@ class ConfluenceLoader(BaseLoader):
self . confluence . get_all_pages_from_space ,
self . confluence . get_all_pages_from_space ,
space = space_key ,
space = space_key ,
limit = limit ,
limit = limit ,
max_pages = max_pages ,
expand = " body.storage.value " ,
expand = " body.storage.value " ,
)
)
for page in pages :
for page in pages :
doc = self . process_page ( page , include_attachments , text_maker )
doc = self . process_page (
page , include_attachments , include_comments , text_maker
)
docs . append ( doc )
docs . append ( doc )
if label :
if label :
@ -202,18 +221,27 @@ class ConfluenceLoader(BaseLoader):
self . confluence . get_all_pages_by_label ,
self . confluence . get_all_pages_by_label ,
label = label ,
label = label ,
limit = limit ,
limit = limit ,
max_pages = max_pages ,
expand = " body.storage.value " ,
expand = " body.storage.value " ,
)
)
for page in pages :
for page in pages :
doc = self . process_page ( page , include_attachments , text_maker )
doc = self . process_page (
page , include_attachments , include_comments , text_maker
)
docs . append ( doc )
docs . append ( doc )
if cql :
if cql :
pages = self . paginate_request (
pages = self . paginate_request (
self . confluence . cql , cql = cql , limit = limit , expand = " body.storage.value "
self . confluence . cql ,
cql = cql ,
limit = limit ,
max_pages = max_pages ,
expand = " body.storage.value " ,
)
)
for page in pages :
for page in pages :
doc = self . process_page ( page , include_attachments , text_maker )
doc = self . process_page (
page , include_attachments , include_comments , text_maker
)
docs . append ( doc )
docs . append ( doc )
if page_ids :
if page_ids :
@ -231,7 +259,9 @@ class ConfluenceLoader(BaseLoader):
before_sleep = before_sleep_log ( logger , logging . WARNING ) ,
before_sleep = before_sleep_log ( logger , logging . WARNING ) ,
) ( self . confluence . get_page_by_id )
) ( self . confluence . get_page_by_id )
page = get_page ( page_id = page_id , expand = " body.storage.value " )
page = get_page ( page_id = page_id , expand = " body.storage.value " )
doc = self . process_page ( page , include_attachments , text_maker )
doc = self . process_page (
page , include_attachments , include_comments , text_maker
)
docs . append ( doc )
docs . append ( doc )
return docs
return docs
@ -239,11 +269,13 @@ class ConfluenceLoader(BaseLoader):
def paginate_request ( self , retrieval_method : Callable , * * kwargs : Any ) - > List :
def paginate_request ( self , retrieval_method : Callable , * * kwargs : Any ) - > List :
""" Paginate the various methods to retrieve groups of pages.
""" Paginate the various methods to retrieve groups of pages.
Unforunately , due to page size , sometimes the Confluence API
Unfortunately , due to page size , sometimes the Confluence API
doesn ' t match the limit value. Also, due to the Atlassian Python
doesn ' t match the limit value. If `limit` is >100 confluence
seems to cap the response to 100. Also , due to the Atlassian Python
package , we don ' t get the " next " values from the " _links " key because
package , we don ' t get the " next " values from the " _links " key because
they only return the value from the results key . So here , the pagination
they only return the value from the results key . So here , the pagination
starts from 0 and goes until the limit . We have to manually check if there
starts from 0 and goes until the max_pages , getting the ` limit ` number
of pages with each request . We have to manually check if there
are more docs based on the length of the returned list of pages , rather than
are more docs based on the length of the returned list of pages , rather than
just checking for the presence of a ` next ` key in the response like this page
just checking for the presence of a ` next ` key in the response like this page
would have you do :
would have you do :
@ -255,10 +287,9 @@ class ConfluenceLoader(BaseLoader):
: rtype : List
: rtype : List
"""
"""
limit = kwargs [ " limit " ]
max_pages = kwargs . pop ( " max_pages " )
page = 0
docs : List [ dict ] = [ ]
docs = [ ]
while len ( docs ) < max_pages :
while page < limit :
get_pages = retry (
get_pages = retry (
reraise = True ,
reraise = True ,
stop = stop_after_attempt (
stop = stop_after_attempt (
@ -271,16 +302,18 @@ class ConfluenceLoader(BaseLoader):
) ,
) ,
before_sleep = before_sleep_log ( logger , logging . WARNING ) ,
before_sleep = before_sleep_log ( logger , logging . WARNING ) ,
) ( retrieval_method )
) ( retrieval_method )
batch = get_pages ( * * kwargs , start = page )
batch = get_pages ( * * kwargs , start = len ( docs ) )
if len ( batch ) < limit :
if not batch :
page = limit
break
else :
page + = len ( batch )
docs . extend ( batch )
docs . extend ( batch )
return docs
return docs [ : max_pages ]
def process_page (
def process_page (
self , page : dict , include_attachments : bool , text_maker : Any
self ,
page : dict ,
include_attachments : bool ,
include_comments : bool ,
text_maker : Any ,
) - > Document :
) - > Document :
if include_attachments :
if include_attachments :
attachment_texts = self . process_attachment ( page [ " id " ] )
attachment_texts = self . process_attachment ( page [ " id " ] )
@ -289,8 +322,23 @@ class ConfluenceLoader(BaseLoader):
text = text_maker . handle ( page [ " body " ] [ " storage " ] [ " value " ] ) + " " . join (
text = text_maker . handle ( page [ " body " ] [ " storage " ] [ " value " ] ) + " " . join (
attachment_texts
attachment_texts
)
)
if include_comments :
comments = self . confluence . get_page_comments (
page [ " id " ] , expand = " body.view.value " , depth = " all "
) [ " results " ]
comment_texts = [
text_maker . handle ( comment [ " body " ] [ " view " ] [ " value " ] )
for comment in comments
]
text = text + " " . join ( comment_texts )
return Document (
return Document (
page_content = text , metadata = { " title " : page [ " title " ] , " id " : page [ " id " ] }
page_content = text ,
metadata = {
" title " : page [ " title " ] ,
" id " : page [ " id " ] ,
" source " : self . base_url . strip ( " / " ) + page [ " _links " ] [ " webui " ] ,
} ,
)
)
def process_attachment ( self , page_id : str ) - > List [ str ] :
def process_attachment ( self , page_id : str ) - > List [ str ] :