Add BlobParser abstraction (#3979)

This PR adds the BlobParser abstraction.

It follows the proposal described here:
https://github.com/hwchase17/langchain/pull/2833#issuecomment-1509097756
pull/4210/head
Eugene Yurtsev 1 year ago committed by GitHub
parent 5ca13cc1f0
commit 423f497168
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,9 +1,10 @@
"""Abstract interface for document loader implementations."""
import abc
from abc import ABC, abstractmethod
from typing import Iterable, List, Optional
from typing import Iterable, Iterator, List, Optional
from langchain.docstore.document import Document
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter
@ -44,3 +45,44 @@ class BaseLoader(ABC):
raise NotImplementedError(
f"{self.__class__.__name__} does not implement lazy_load()"
)
class BaseBlobParser(abc.ABC):
    """Interface that every blob parser implements.

    A blob parser turns the raw data held by a blob into one or more
    documents.

    Because parsing is kept separate from loading, the same parser can be
    paired with any blob loader, regardless of where the blob came from.
    """

    @abc.abstractmethod
    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse the blob lazily, producing documents one at a time.

        Every concrete subclass must provide this method.

        Args:
            blob: The blob to parse.

        Returns:
            An iterator over the documents extracted from the blob.
        """

    def parse(self, blob: Blob) -> List[Document]:
        """Parse the blob eagerly and return all documents at once.

        Intended as a convenience for interactive work; production code
        should prefer ``lazy_parse``.

        Subclasses normally should not override this method.

        Args:
            blob: The blob to parse.

        Returns:
            A list holding every document produced by ``lazy_parse``.
        """
        # Drain the lazy iterator completely so callers receive a plain list.
        lazy_documents = self.lazy_parse(blob)
        return [*lazy_documents]

@ -0,0 +1,28 @@
"""Test Base Schema of documents."""
from typing import Iterator
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document
def test_base_blob_parser() -> None:
    """Check that the inherited ``parse`` delegates to ``lazy_parse``."""

    class SingleDocumentParser(BaseBlobParser):
        """Parser stub that emits one fixed document for any blob."""

        def lazy_parse(self, blob: Blob) -> Iterator[Document]:
            """Yield a single hard-coded document; the blob is not inspected."""
            yield Document(page_content="foo")

    blob_parser = SingleDocumentParser()

    # The lazy entry point must hand back an iterator, not a realized list.
    lazy_result = blob_parser.lazy_parse(Blob(data="who?"))
    assert isinstance(lazy_result, Iterator)

    # The eager entry point is not overridden here, so a correct result shows
    # that the base-class implementation consumes lazy_parse.
    parsed = blob_parser.parse(Blob(data="who?"))
    assert len(parsed) == 1
    assert parsed[0].page_content == "foo"
Loading…
Cancel
Save