From ec10787bc758391898183aa280e02c9c41b8b862 Mon Sep 17 00:00:00 2001 From: Chathura Rathnayake <49788133+Chathura-Rathnayake@users.noreply.github.com> Date: Fri, 7 Jul 2023 02:51:43 +0530 Subject: [PATCH] Fixed the confluence loader ".csv" files loading issue (#7195) - Description: Sometimes there are csv attachments with the media type "application/vnd.ms-excel". These files failed to be loaded via the xlrd library. It throws a corrupted file error. I fixed it by processing these csv attachments separately using pandas. Excel files will be processed just like before. - Dependencies: pandas, os, io --------- Co-authored-by: Chathura Co-authored-by: Bagatur --- langchain/document_loaders/confluence.py | 38 +++++++++++++++++++----- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py index b7608f23e7..9619071332 100644 --- a/langchain/document_loaders/confluence.py +++ b/langchain/document_loaders/confluence.py @@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader): return docx2txt.process(file_data) def process_xls(self, link: str) -> str: + import io + import os + try: import xlrd # noqa: F401 + except ImportError: raise ImportError("`xlrd` package not found, please run `pip install xlrd`") + try: + import pandas as pd + + except ImportError: + raise ImportError( + "`pandas` package not found, please run `pip install pandas`" + ) + response = self.confluence.request(path=link, absolute=True) text = "" @@ -604,14 +616,26 @@ class ConfluenceLoader(BaseLoader): ): return text - workbook = xlrd.open_workbook(file_contents=response.content) - for sheet in workbook.sheets(): - text += f"{sheet.name}:\n" - for row in range(sheet.nrows): - for col in range(sheet.ncols): - text += f"{sheet.cell_value(row, col)}\t" + filename = os.path.basename(link) + # Getting the whole content of the url after filename, + # Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2" + 
file_extension = os.path.splitext(filename)[1] + + if file_extension.startswith( + ".csv" + ): # if the extension found in the url is ".csv" + content_string = response.content.decode("utf-8") + df = pd.read_csv(io.StringIO(content_string)) + text += df.to_string(index=False, header=False) + "\n\n" + else: + workbook = xlrd.open_workbook(file_contents=response.content) + for sheet in workbook.sheets(): + text += f"{sheet.name}:\n" + for row in range(sheet.nrows): + for col in range(sheet.ncols): + text += f"{sheet.cell_value(row, col)}\t" + text += "\n" text += "\n" - text += "\n" return text