Add ConcurrentLoader (#7512)

Works just like the GenericLoader but concurrently for those who choose to optimize their workflow. @rlancemartin @eyurtsev --------- Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
2024-11-04 06:00:26 +00:00 · 2023-07-31 20:56:31 -04:00 · 2023-07-31 20:56:31 -04:00 · 1e8fca5518
commit 1e8fca5518
parent 8061994c61
4 changed files with 211 additions and 0 deletions
--- a/docs/extras/integrations/document_loaders/concurrent.ipynb
+++ b/docs/extras/integrations/document_loaders/concurrent.ipynb
@ -0,0 +1,94 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "23c6e167",
+   "metadata": {},
+   "source": [
+    "# Concurrent Loader\n",
+    "\n",
+    "Works just like the GenericLoader but concurrently for those who choose to optimize their workflow.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "6ff3fb1f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.document_loaders import ConcurrentLoader"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "ce96fa20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "loader = ConcurrentLoader.from_filesystem('example_data/', glob=\"**/*.txt\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "06a6cf5d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "files = loader.load()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "b87d3e58",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "2"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(files)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "668f1ee5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/libs/langchain/langchain/document_loaders/init.py
+++ b/libs/langchain/langchain/document_loaders/init.py
@ -28,6 +28,7 @@ from langchain.document_loaders.brave_search import BraveSearchLoader
 from langchain.document_loaders.browserless import BrowserlessLoader
 from langchain.document_loaders.chatgpt import ChatGPTLoader
 from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
+from langchain.document_loaders.concurrent import ConcurrentLoader
 from langchain.document_loaders.confluence import ConfluenceLoader
 from langchain.document_loaders.conllu import CoNLLULoader
 from langchain.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
@ -305,4 +306,5 @@ __all__ = [
    "XorbitsLoader",
    "YoutubeAudioLoader",
    "YoutubeLoader",
+    "ConcurrentLoader",
 ]
--- a/libs/langchain/langchain/document_loaders/concurrent.py
+++ b/libs/langchain/langchain/document_loaders/concurrent.py
@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import concurrent.futures
+from pathlib import Path
+from typing import Iterator, Literal, Optional, Sequence, Union
+
+from langchain.document_loaders.base import BaseBlobParser
+from langchain.document_loaders.blob_loaders import BlobLoader, FileSystemBlobLoader
+from langchain.document_loaders.generic import GenericLoader
+from langchain.document_loaders.parsers.registry import get_parser
+from langchain.schema import Document
+
+_PathLike = Union[str, Path]
+
+DEFAULT = Literal["default"]
+
+
+class ConcurrentLoader(GenericLoader):
+    """
+    A generic document loader that loads and parses documents concurrently.
+    """
+
+    def __init__(
+        self, blob_loader: BlobLoader, blob_parser: BaseBlobParser, num_workers: int = 4
+    ) -> None:
+        super().__init__(blob_loader, blob_parser)
+        self.num_workers = num_workers
+
+    def lazy_load(
+        self,
+    ) -> Iterator[Document]:
+        """Load documents lazily with concurrent parsing."""
+        with concurrent.futures.ThreadPoolExecutor(
+            max_workers=self.num_workers
+        ) as executor:
+            futures = {
+                executor.submit(self.blob_parser.lazy_parse, blob)
+                for blob in self.blob_loader.yield_blobs()
+            }
+            for future in concurrent.futures.as_completed(futures):
+                yield from future.result()
+
+    @classmethod
+    def from_filesystem(
+        cls,
+        path: _PathLike,
+        *,
+        glob: str = "**/[!.]*",
+        suffixes: Optional[Sequence[str]] = None,
+        show_progress: bool = False,
+        parser: Union[DEFAULT, BaseBlobParser] = "default",
+        num_workers: int = 4,
+    ) -> ConcurrentLoader:
+        """
+        Create a concurrent generic document loader using a
+        filesystem blob loader.
+        """
+        blob_loader = FileSystemBlobLoader(
+            path, glob=glob, suffixes=suffixes, show_progress=show_progress
+        )
+        if isinstance(parser, str):
+            blob_parser = get_parser(parser)
+        else:
+            blob_parser = parser
+        return cls(blob_loader, blob_parser, num_workers)
--- a/libs/langchain/tests/integration_tests/document_loaders/parsers/test_language.py
+++ b/libs/langchain/tests/integration_tests/document_loaders/parsers/test_language.py
@ -2,6 +2,7 @@ from pathlib import Path

 import pytest

+from langchain.document_loaders.concurrent import ConcurrentLoader
 from langchain.document_loaders.generic import GenericLoader
 from langchain.document_loaders.parsers import LanguageParser
 from langchain.text_splitter import Language
@ -131,3 +132,52 @@ def test_language_loader_for_javascript_with_parser_threshold() -> None:
    docs = loader.load()

    assert len(docs) == 1
+
+
+def test_concurrent_language_loader_for_javascript_with_parser_threshold() -> None:
+    """Test JavaScript ConcurrentLoader with parser enabled and below threshold."""
+    file_path = Path(__file__).parent.parent.parent / "examples"
+    loader = ConcurrentLoader.from_filesystem(
+        file_path,
+        glob="hello_world.js",
+        parser=LanguageParser(language=Language.JS, parser_threshold=1000),
+    )
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+
+def test_concurrent_language_loader_for_python_with_parser_threshold() -> None:
+    """Test Python ConcurrentLoader with parser enabled and below threshold."""
+    file_path = Path(__file__).parent.parent.parent / "examples"
+    loader = ConcurrentLoader.from_filesystem(
+        file_path,
+        glob="hello_world.py",
+        parser=LanguageParser(language=Language.PYTHON, parser_threshold=1000),
+    )
+    docs = loader.load()
+
+    assert len(docs) == 1
+
+
+@pytest.mark.skipif(not esprima_installed(), reason="requires esprima package")
+def test_concurrent_language_loader_for_javascript() -> None:
+    """Test JavaScript ConcurrentLoader with parser enabled."""
+    file_path = Path(__file__).parent.parent.parent / "examples"
+    loader = ConcurrentLoader.from_filesystem(
+        file_path, glob="hello_world.js", parser=LanguageParser(parser_threshold=5)
+    )
+    docs = loader.load()
+
+    assert len(docs) == 3
+
+
+def test_concurrent_language_loader_for_python() -> None:
+    """Test Python ConcurrentLoader with parser enabled."""
+    file_path = Path(__file__).parent.parent.parent / "examples"
+    loader = ConcurrentLoader.from_filesystem(
+        file_path, glob="hello_world.py", parser=LanguageParser(parser_threshold=5)
+    )
+    docs = loader.load()
+
+    assert len(docs) == 2