Fixed the confluence loader ".csv" files loading issue (#7195)

- Description: Confluence sometimes serves CSV attachments with the media type "application/vnd.ms-excel". Loading these files via the xlrd library failed with a corrupted-file error. I fixed it by processing CSV files separately using pandas. Excel files will be processed just like before. - Dependencies: pandas, os, io --------- Co-authored-by: Chathura <chathurar@yaalalabs.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
2024-11-06 03:20:49 +00:00 · 2023-07-07 02:51:43 +05:30 · 2023-07-07 02:51:43 +05:30 · ec10787bc7
commit ec10787bc7
parent b21c2f8704
1 changed file with 31 additions and 7 deletions
--- a/langchain/document_loaders/confluence.py
+++ b/langchain/document_loaders/confluence.py
@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader):
        return docx2txt.process(file_data)

    def process_xls(self, link: str) -> str:
+        import io
+        import os
+
        try:
            import xlrd  # noqa: F401
+
        except ImportError:
            raise ImportError("`xlrd` package not found, please run `pip install xlrd`")

+        try:
+            import pandas as pd
+
+        except ImportError:
+            raise ImportError(
+                "`pandas` package not found, please run `pip install pandas`"
+            )
+
        response = self.confluence.request(path=link, absolute=True)
        text = ""

@ -604,14 +616,26 @@ class ConfluenceLoader(BaseLoader):
        ):
            return text

-        workbook = xlrd.open_workbook(file_contents=response.content)
-        for sheet in workbook.sheets():
-            text += f"{sheet.name}:\n"
-            for row in range(sheet.nrows):
-                for col in range(sheet.ncols):
-                    text += f"{sheet.cell_value(row, col)}\t"
+        filename = os.path.basename(link)
+        # Getting the whole content of the url after filename,
+        # Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
+        file_extension = os.path.splitext(filename)[1]
+
+        if file_extension.startswith(
+            ".csv"
+        ):  # if the extension found in the url is ".csv"
+            content_string = response.content.decode("utf-8")
+            df = pd.read_csv(io.StringIO(content_string))
+            text += df.to_string(index=False, header=False) + "\n\n"
+        else:
+            workbook = xlrd.open_workbook(file_contents=response.content)
+            for sheet in workbook.sheets():
+                text += f"{sheet.name}:\n"
+                for row in range(sheet.nrows):
+                    for col in range(sheet.ncols):
+                        text += f"{sheet.cell_value(row, col)}\t"
+                    text += "\n"
                text += "\n"
-            text += "\n"

        return text