This commit is contained in:
Davis Chase 2023-05-30 10:42:20 -07:00 committed by GitHub
parent 64b4165c8d
commit 2649b638dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -4,8 +4,6 @@ import logging
import sys import sys
from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple from typing import TYPE_CHECKING, Any, Iterator, List, Optional, Tuple
import psutil
from langchain.docstore.document import Document from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader from langchain.document_loaders.base import BaseLoader
@@ -29,7 +27,7 @@ class PySparkDataFrameLoader(BaseLoader):
try: try:
from pyspark.sql import DataFrame, SparkSession from pyspark.sql import DataFrame, SparkSession
except ImportError: except ImportError:
raise ValueError( raise ImportError(
"pyspark is not installed. " "pyspark is not installed. "
"Please install it with `pip install pyspark`" "Please install it with `pip install pyspark`"
) )
@@ -51,6 +49,12 @@ class PySparkDataFrameLoader(BaseLoader):
def get_num_rows(self) -> Tuple[int, int]: def get_num_rows(self) -> Tuple[int, int]:
"""Gets the amount of "feasible" rows for the DataFrame""" """Gets the amount of "feasible" rows for the DataFrame"""
try:
import psutil
except ImportError as e:
raise ImportError(
"psutil not installed. Please install it with `pip install psutil`."
) from e
row = self.df.limit(1).collect()[0] row = self.df.limit(1).collect()[0]
estimated_row_size = sys.getsizeof(row) estimated_row_size = sys.getsizeof(row)
mem_info = psutil.virtual_memory() mem_info = psutil.virtual_memory()