From 2052e70664c747637200ef0ad86fa53ba8b99ab9 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Thu, 27 Apr 2023 14:29:01 -0400 Subject: [PATCH] Add lazy iteration interface to document loaders (#3659) Adding a lazy iteration for document loaders. Following the plan here: https://github.com/hwchase17/langchain/pull/2833 Keeping the `load` method as is for backwards compatibility. The `load` returns a materialized list of documents and downstream users may rely on that fact. A new method that returns an iterable is introduced for handling lazy loading. --------- Co-authored-by: Zander Chase <130414180+vowelparrot@users.noreply.github.com> --- langchain/document_loaders/base.py | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/langchain/document_loaders/base.py b/langchain/document_loaders/base.py index d5784a74..b89f8785 100644 --- a/langchain/document_loaders/base.py +++ b/langchain/document_loaders/base.py @@ -1,15 +1,25 @@ -"""Base loader class.""" +"""Abstract interface for document loader implementations.""" from abc import ABC, abstractmethod -from typing import List, Optional +from typing import Iterable, List, Optional from langchain.docstore.document import Document from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter class BaseLoader(ABC): - """Base loader class.""" + """Interface for loading documents. + Implementations should implement the lazy-loading method using generators + to avoid loading all documents into memory at once. + + The `load` method will remain as is for backwards compatibility, but its + implementation should be just `list(self.lazy_load())`. + """ + + # Sub-classes should implement this method + # as return list(self.lazy_load()). + # This method returns a List which is materialized in memory. 
@abstractmethod def load(self) -> List[Document]: """Load data into document objects.""" @@ -24,3 +34,13 @@ class BaseLoader(ABC): _text_splitter = text_splitter docs = self.load() return _text_splitter.split_documents(docs) + + # Attention: This method will be upgraded into an abstractmethod once it's + # implemented in all the existing subclasses. + def lazy_load( + self, + ) -> Iterable[Document]: + """A lazy loader for document content.""" + raise NotImplementedError( + f"{self.__class__.__name__} does not implement lazy_load()" + )