|
|
@ -541,26 +541,33 @@ class ConfluenceLoader(BaseLoader):
|
|
|
|
media_type = attachment["metadata"]["mediaType"]
|
|
|
|
media_type = attachment["metadata"]["mediaType"]
|
|
|
|
absolute_url = self.base_url + attachment["_links"]["download"]
|
|
|
|
absolute_url = self.base_url + attachment["_links"]["download"]
|
|
|
|
title = attachment["title"]
|
|
|
|
title = attachment["title"]
|
|
|
|
if media_type == "application/pdf":
|
|
|
|
try:
|
|
|
|
text = title + self.process_pdf(absolute_url, ocr_languages)
|
|
|
|
if media_type == "application/pdf":
|
|
|
|
elif (
|
|
|
|
text = title + self.process_pdf(absolute_url, ocr_languages)
|
|
|
|
media_type == "image/png"
|
|
|
|
elif (
|
|
|
|
or media_type == "image/jpg"
|
|
|
|
media_type == "image/png"
|
|
|
|
or media_type == "image/jpeg"
|
|
|
|
or media_type == "image/jpg"
|
|
|
|
):
|
|
|
|
or media_type == "image/jpeg"
|
|
|
|
text = title + self.process_image(absolute_url, ocr_languages)
|
|
|
|
):
|
|
|
|
elif (
|
|
|
|
text = title + self.process_image(absolute_url, ocr_languages)
|
|
|
|
media_type == "application/vnd.openxmlformats-officedocument"
|
|
|
|
elif (
|
|
|
|
".wordprocessingml.document"
|
|
|
|
media_type == "application/vnd.openxmlformats-officedocument"
|
|
|
|
):
|
|
|
|
".wordprocessingml.document"
|
|
|
|
text = title + self.process_doc(absolute_url)
|
|
|
|
):
|
|
|
|
elif media_type == "application/vnd.ms-excel":
|
|
|
|
text = title + self.process_doc(absolute_url)
|
|
|
|
text = title + self.process_xls(absolute_url)
|
|
|
|
elif media_type == "application/vnd.ms-excel":
|
|
|
|
elif media_type == "image/svg+xml":
|
|
|
|
text = title + self.process_xls(absolute_url)
|
|
|
|
text = title + self.process_svg(absolute_url, ocr_languages)
|
|
|
|
elif media_type == "image/svg+xml":
|
|
|
|
else:
|
|
|
|
text = title + self.process_svg(absolute_url, ocr_languages)
|
|
|
|
continue
|
|
|
|
else:
|
|
|
|
texts.append(text)
|
|
|
|
continue
|
|
|
|
|
|
|
|
texts.append(text)
|
|
|
|
|
|
|
|
except requests.HTTPError as e:
|
|
|
|
|
|
|
|
if e.response.status_code == 404:
|
|
|
|
|
|
|
|
print(f"Attachment not found at {absolute_url}")
|
|
|
|
|
|
|
|
continue
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
|
|
|
raise
|
|
|
|
|
|
|
|
|
|
|
|
return texts
|
|
|
|
return texts
|
|
|
|
|
|
|
|
|
|
|
|