From ec10787bc758391898183aa280e02c9c41b8b862 Mon Sep 17 00:00:00 2001
From: Chathura Rathnayake
 <49788133+Chathura-Rathnayake@users.noreply.github.com>
Date: Fri, 7 Jul 2023 02:51:43 +0530
Subject: [PATCH] Fixed the confluence loader ".csv" files loading issue
 (#7195)

- Description: Sometimes there are csv attachments with the media type
"application/vnd.ms-excel". These files fail to load via the xlrd
library, which raises a corrupted-file error. I fixed it by processing
such csv files separately using pandas. Excel files are processed just
like before.

- Dependencies: pandas, os, io

---------

Co-authored-by: Chathura <chathurar@yaalalabs.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
---
 langchain/document_loaders/confluence.py | 38 +++++++++++++++++++-----
 1 file changed, 31 insertions(+), 7 deletions(-)

diff --git a/langchain/document_loaders/confluence.py b/langchain/document_loaders/confluence.py
index b7608f23e7..9619071332 100644
--- a/langchain/document_loaders/confluence.py
+++ b/langchain/document_loaders/confluence.py
@@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader):
         return docx2txt.process(file_data)
 
     def process_xls(self, link: str) -> str:
+        import io
+        import os
+
         try:
             import xlrd  # noqa: F401
+
         except ImportError:
             raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
 
+        try:
+            import pandas as pd
+
+        except ImportError:
+            raise ImportError(
+                "`pandas` package not found, please run `pip install pandas`"
+            )
+
         response = self.confluence.request(path=link, absolute=True)
         text = ""
 
@@ -604,14 +616,26 @@ class ConfluenceLoader(BaseLoader):
         ):
             return text
 
-        workbook = xlrd.open_workbook(file_contents=response.content)
-        for sheet in workbook.sheets():
-            text += f"{sheet.name}:\n"
-            for row in range(sheet.nrows):
-                for col in range(sheet.ncols):
-                    text += f"{sheet.cell_value(row, col)}\t"
+        filename = os.path.basename(link)
+        # Getting the whole content of the url after filename,
+        # Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
+        file_extension = os.path.splitext(filename)[1]
+
+        if file_extension.startswith(
+            ".csv"
+        ):  # if the extension found in the url is ".csv"
+            content_string = response.content.decode("utf-8")
+            df = pd.read_csv(io.StringIO(content_string))
+            text += df.to_string(index=False, header=False) + "\n\n"
+        else:
+            workbook = xlrd.open_workbook(file_contents=response.content)
+            for sheet in workbook.sheets():
+                text += f"{sheet.name}:\n"
+                for row in range(sheet.nrows):
+                    for col in range(sheet.ncols):
+                        text += f"{sheet.cell_value(row, col)}\t"
+                    text += "\n"
                 text += "\n"
-            text += "\n"
 
         return text