mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
Fixed the confluence loader ".csv" files loading issue (#7195)
- Description: CSV attachments are sometimes served with the media type "application/vnd.ms-excel". Loading these files through the xlrd library failed with a corrupted-file error. I fixed this by processing CSV files separately using pandas; Excel files are still processed with xlrd, just as before. - Dependencies: pandas, os, io --------- Co-authored-by: Chathura <chathurar@yaalalabs.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
b21c2f8704
commit
ec10787bc7
@ -589,11 +589,23 @@ class ConfluenceLoader(BaseLoader):
|
||||
return docx2txt.process(file_data)
|
||||
|
||||
def process_xls(self, link: str) -> str:
|
||||
import io
|
||||
import os
|
||||
|
||||
try:
|
||||
import xlrd # noqa: F401
|
||||
|
||||
except ImportError:
|
||||
raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"`pandas` package not found, please run `pip install pandas`"
|
||||
)
|
||||
|
||||
response = self.confluence.request(path=link, absolute=True)
|
||||
text = ""
|
||||
|
||||
@ -604,6 +616,18 @@ class ConfluenceLoader(BaseLoader):
|
||||
):
|
||||
return text
|
||||
|
||||
filename = os.path.basename(link)
|
||||
# Getting the whole content of the url after filename,
|
||||
# Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
|
||||
file_extension = os.path.splitext(filename)[1]
|
||||
|
||||
if file_extension.startswith(
|
||||
".csv"
|
||||
): # if the extension found in the url is ".csv"
|
||||
content_string = response.content.decode("utf-8")
|
||||
df = pd.read_csv(io.StringIO(content_string))
|
||||
text += df.to_string(index=False, header=False) + "\n\n"
|
||||
else:
|
||||
workbook = xlrd.open_workbook(file_contents=response.content)
|
||||
for sheet in workbook.sheets():
|
||||
text += f"{sheet.name}:\n"
|
||||
|
Loading…
Reference in New Issue
Block a user