infra: test directory loader multithreaded (#20281)

This is a unit test for #20230 which was a fix for using multithreaded
mode with directory loader @eyurtsev
pull/20985/head
Chip Davis 3 weeks ago committed by GitHub
parent f931a9ce60
commit e818c75f8a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@@ -8,6 +8,64 @@ from langchain_community.document_loaders.directory import DirectoryLoader
class TestDirectoryLoader:
# Tests that when multithreading is enabled, multiple documents are read successfully.
def test_directory_loader_with_multithreading_enabled(self) -> None:
    """Check that a multithreaded ``DirectoryLoader`` returns every CSV row.

    Builds a loader over the CSV fixture directory with
    ``use_multithreading=True`` and compares the loaded documents against
    the full list of expected ``Document`` objects (three rows from
    ``test_one_col.csv``, one from ``test_one_row.csv`` and two from
    ``test_nominal.csv``).
    """
    dir_path = self._get_csv_dir_path()
    loader = DirectoryLoader(
        dir_path, glob="**/*.csv", loader_cls=CSVLoader, use_multithreading=True
    )

    expected_docs = [
        Document(
            page_content="column1: value1",
            metadata={
                "source": self._get_csv_file_path("test_one_col.csv"),
                "row": 0,
            },
        ),
        Document(
            page_content="column1: value2",
            metadata={
                "source": self._get_csv_file_path("test_one_col.csv"),
                "row": 1,
            },
        ),
        Document(
            page_content="column1: value3",
            metadata={
                "source": self._get_csv_file_path("test_one_col.csv"),
                "row": 2,
            },
        ),
        Document(
            page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
            metadata={
                "source": self._get_csv_file_path("test_one_row.csv"),
                "row": 0,
            },
        ),
        Document(
            page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
            metadata={
                "source": self._get_csv_file_path("test_nominal.csv"),
                "row": 0,
            },
        ),
        Document(
            page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
            metadata={
                "source": self._get_csv_file_path("test_nominal.csv"),
                "row": 1,
            },
        ),
    ]

    # The order in which files are returned is not part of what this test
    # checks, so both lists are normalised before comparing. Sorting on
    # (source, row) rather than source alone makes the order fully
    # deterministic instead of relying on sort stability within a file.
    loaded_docs = sorted(
        loader.load(),
        key=lambda doc: (doc.metadata["source"], doc.metadata["row"]),
    )
    expected_docs = sorted(
        expected_docs,
        key=lambda doc: (doc.metadata["source"], doc.metadata["row"]),
    )

    # Compare the whole lists: an element-by-element loop over the loaded
    # documents would pass silently if the loader returned too few (or no)
    # documents.
    assert loaded_docs == expected_docs
# Tests that lazy loading a CSV file with multiple documents is successful.
def test_directory_loader_lazy_load_single_file_multiple_docs(self) -> None:
# Setup

Loading…
Cancel
Save