Add lazy iteration interface to document loaders (#3659)

Adds a lazy iteration interface to document loaders.

Following the plan here:
https://github.com/hwchase17/langchain/pull/2833

Keeping the `load` method as is for backwards compatibility. The `load`
method returns a materialized list of documents, and downstream users may rely
on that fact.

A new method that returns an iterable is introduced for handling lazy
loading.

---------

Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com>
fix_agent_callbacks
Eugene Yurtsev 1 year ago committed by GitHub
parent 8a54217e7b
commit 2052e70664
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,15 +1,25 @@
"""Base loader class."""
"""Abstract interface for document loader implementations."""
from abc import ABC, abstractmethod
from typing import List, Optional
from typing import Iterable, List, Optional
from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
class BaseLoader(ABC):
"""Base loader class."""
"""Interface for loading documents.
Implementations should implement the lazy-loading method using generators
to avoid loading all documents into memory at once.
The `load` method will remain as is for backwards compatibility, but its
implementation should be just `list(self.lazy_load())`.
"""
# Sub-classes should implement this method
# as return list(self.lazy_load()).
# This method returns a List which is materialized in memory.
@abstractmethod
def load(self) -> List[Document]:
    """Eagerly load the data and return it as document objects.

    Returns:
        A list of documents that is fully materialized in memory.
    """
@ -24,3 +34,13 @@ class BaseLoader(ABC):
_text_splitter = text_splitter
docs = self.load()
return _text_splitter.split_documents(docs)
# Attention: This method will be upgraded into an abstractmethod once it's
# implemented in all the existing subclasses.
def lazy_load(self) -> Iterable[Document]:
    """Lazily load document content.

    This default implementation provides no loading logic.

    Raises:
        NotImplementedError: Always; the message names the concrete class
            on which ``lazy_load`` was called.
    """
    loader_name = self.__class__.__name__
    message = f"{loader_name} does not implement lazy_load()"
    raise NotImplementedError(message)

Loading…
Cancel
Save