diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py index b7608f23e7..9619071332 100644 --- a/langchain/document_loaders/confluence.py +++ b/langchain/document_loaders/confluence.py @@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader): return docx2txt.process(file_data) def process_xls(self, link: str) -> str: + import io + import os + try: import xlrd # noqa: F401 + except ImportError: raise ImportError("`xlrd` package not found, please run `pip install xlrd`") + try: + import pandas as pd + + except ImportError: + raise ImportError( + "`pandas` package not found, please run `pip install pandas`" + ) + response = self.confluence.request(path=link, absolute=True) text = "" @@ -604,14 +616,26 @@ class ConfluenceLoader(BaseLoader): ): return text - workbook = xlrd.open_workbook(file_contents=response.content) - for sheet in workbook.sheets(): - text += f"{sheet.name}:\n" - for row in range(sheet.nrows): - for col in range(sheet.ncols): - text += f"{sheet.cell_value(row, col)}\t" + filename = os.path.basename(link) + # Getting the whole content of the url after filename, + # Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2" + file_extension = os.path.splitext(filename)[1] + + if file_extension.startswith( + ".csv" + ): # if the extension found in the url is ".csv" + content_string = response.content.decode("utf-8") + df = pd.read_csv(io.StringIO(content_string)) + text += df.to_string(index=False, header=False) + "\n\n" + else: + workbook = xlrd.open_workbook(file_contents=response.content) + for sheet in workbook.sheets(): + text += f"{sheet.name}:\n" + for row in range(sheet.nrows): + for col in range(sheet.ncols): + text += f"{sheet.cell_value(row, col)}\t" + text += "\n" text += "\n" - text += "\n" return text