From c14a8df2eee39b4143304dfae0f492869a940d79 Mon Sep 17 00:00:00 2001 From: April <35612351+aprilyw@users.noreply.github.com> Date: Wed, 11 Oct 2023 20:13:42 -0500 Subject: [PATCH] wrap confluence attachment processing with a try-except block (#11503) Prevents document loading from erroring out when an attachment is not found at the url. --------- Co-authored-by: Bagatur --- .../langchain/document_loaders/confluence.py | 47 +++++++++++-------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/libs/langchain/langchain/document_loaders/confluence.py b/libs/langchain/langchain/document_loaders/confluence.py index 5a12e8984f..cb17c0f4b2 100644 --- a/libs/langchain/langchain/document_loaders/confluence.py +++ b/libs/langchain/langchain/document_loaders/confluence.py @@ -541,26 +541,33 @@ class ConfluenceLoader(BaseLoader): media_type = attachment["metadata"]["mediaType"] absolute_url = self.base_url + attachment["_links"]["download"] title = attachment["title"] - if media_type == "application/pdf": - text = title + self.process_pdf(absolute_url, ocr_languages) - elif ( - media_type == "image/png" - or media_type == "image/jpg" - or media_type == "image/jpeg" - ): - text = title + self.process_image(absolute_url, ocr_languages) - elif ( - media_type == "application/vnd.openxmlformats-officedocument" - ".wordprocessingml.document" - ): - text = title + self.process_doc(absolute_url) - elif media_type == "application/vnd.ms-excel": - text = title + self.process_xls(absolute_url) - elif media_type == "image/svg+xml": - text = title + self.process_svg(absolute_url, ocr_languages) - else: - continue - texts.append(text) + try: + if media_type == "application/pdf": + text = title + self.process_pdf(absolute_url, ocr_languages) + elif ( + media_type == "image/png" + or media_type == "image/jpg" + or media_type == "image/jpeg" + ): + text = title + self.process_image(absolute_url, ocr_languages) + elif ( + media_type == "application/vnd.openxmlformats-officedocument" + ".wordprocessingml.document" + ): + text = title + self.process_doc(absolute_url) + elif media_type == "application/vnd.ms-excel": + text = title + self.process_xls(absolute_url) + elif media_type == "image/svg+xml": + text = title + self.process_svg(absolute_url, ocr_languages) + else: + continue + texts.append(text) + except requests.HTTPError as e: + if e.response.status_code == 404: + print(f"Attachment not found at {absolute_url}") + continue + else: + raise return texts