Fixed the Confluence loader ".csv" file loading issue (#7195)

- Description: Sometimes there are CSV attachments with the media type
"application/vnd.ms-excel". These files failed to load via the xlrd
library, which raised a corrupted-file error. I fixed it by processing
CSV files separately using pandas. Excel files will be processed just
like before.

- Dependencies: pandas, os, io

---------

Co-authored-by: Chathura <chathurar@yaalalabs.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
Chathura Rathnayake 2023-07-07 02:51:43 +05:30 committed by GitHub
parent b21c2f8704
commit ec10787bc7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader):
return docx2txt.process(file_data)
def process_xls(self, link: str) -> str:
import io
import os
try:
import xlrd # noqa: F401
except ImportError:
raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
try:
import pandas as pd
except ImportError:
raise ImportError(
"`pandas` package not found, please run `pip install pandas`"
)
response = self.confluence.request(path=link, absolute=True)
text = ""
@ -604,14 +616,26 @@ class ConfluenceLoader(BaseLoader):
):
return text
workbook = xlrd.open_workbook(file_contents=response.content)
for sheet in workbook.sheets():
text += f"{sheet.name}:\n"
for row in range(sheet.nrows):
for col in range(sheet.ncols):
text += f"{sheet.cell_value(row, col)}\t"
filename = os.path.basename(link)
# The link may carry a query string after the filename, so the "extension"
# returned by splitext includes everything that follows it,
# Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
file_extension = os.path.splitext(filename)[1]
if file_extension.startswith(
".csv"
): # if the extension found in the url is ".csv"
content_string = response.content.decode("utf-8")
df = pd.read_csv(io.StringIO(content_string))
text += df.to_string(index=False, header=False) + "\n\n"
else:
workbook = xlrd.open_workbook(file_contents=response.content)
for sheet in workbook.sheets():
text += f"{sheet.name}:\n"
for row in range(sheet.nrows):
for col in range(sheet.ncols):
text += f"{sheet.cell_value(row, col)}\t"
text += "\n"
text += "\n"
text += "\n"
return text