# ---------------------------------------------------------------------------
# File: langchain/document_loaders/parsers/__init__.py  (new, empty)
# File: langchain/document_loaders/parsers/generic.py   (new)
# ---------------------------------------------------------------------------
"""Code for generic / auxiliary parsers.

This module contains some logic to help assemble more sophisticated parsers.
"""
from typing import Iterator, Mapping, Optional

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders.schema import Blob
from langchain.schema import Document


class MimeTypeBasedParser(BaseBlobParser):
    """A parser that uses mime-types to determine how to parse a blob.

    This parser is useful for simple pipelines where the mime-type is sufficient
    to determine how to parse a blob.

    To use, configure handlers based on mime-types and pass them to the initializer.
    A blob whose mime-type has no handler is passed to the fallback parser when one
    is configured; otherwise it is rejected with a ValueError. A blob without any
    mime-type is always rejected with a ValueError.

    Example:

        .. code-block:: python

            from langchain.document_loaders.parsers.generic import MimeTypeBasedParser

            parser = MimeTypeBasedParser(
                handlers={
                    "application/pdf": ...,
                },
                fallback_parser=...,
            )
    """

    def __init__(
        self,
        handlers: Mapping[str, BaseBlobParser],
        fallback_parser: Optional[BaseBlobParser] = None,
    ) -> None:
        """Define a parser that uses mime-types to determine how to parse a blob.

        Args:
            handlers: A mapping from mime-types to the parsers responsible for
                      blobs of that mime-type.
            fallback_parser: A parser to use if the mime-type is not found in
                             the handlers. If provided, this parser will be
                             used to parse blobs with all mime-types not found in
                             the handlers.
                             If not provided, a ValueError will be raised if the
                             mime-type is not found in the handlers.
        """
        self.handlers = handlers
        self.fallback_parser = fallback_parser

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Load documents from a blob using the parser matching its mime-type.

        Args:
            blob: The blob to parse. Its ``mimetype`` selects the parser.

        Yields:
            The documents produced by the selected parser's ``lazy_parse``.

        Raises:
            ValueError: If the blob has no mime-type, or if its mime-type has no
                registered handler and no fallback parser was configured.
                Because this method is a generator, the error surfaces when
                iteration starts, not when the method is called.
        """
        mimetype = blob.mimetype

        if mimetype is None:
            raise ValueError(f"{blob} does not have a mimetype.")

        # Resolve the parser first so that delegation happens in one place.
        if mimetype in self.handlers:
            parser = self.handlers[mimetype]
        elif self.fallback_parser is not None:
            parser = self.fallback_parser
        else:
            raise ValueError(f"Unsupported mime type: {mimetype}")

        yield from parser.lazy_parse(blob)


# ---------------------------------------------------------------------------
# File: tests/unit_tests/document_loader/parsers/__init__.py         (new, empty)
# File: tests/unit_tests/document_loader/parsers/test_generic.py     (new)
# ---------------------------------------------------------------------------
"""Module to test generic parsers."""

from typing import Iterator

import pytest

from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.generic import MimeTypeBasedParser
from langchain.schema import Document


class _FirstCharParser(BaseBlobParser):
    """Test parser yielding one document holding the blob's first character."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the first character of a blob."""
        yield Document(page_content=blob.as_string()[0])


class _SecondCharParser(BaseBlobParser):
    """Test parser yielding one document holding the blob's second character."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the second character of a blob."""
        yield Document(page_content=blob.as_string()[1])


class _ThirdCharParser(BaseBlobParser):
    """Test parser yielding one document holding the blob's third character."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the third character of a blob."""
        yield Document(page_content=blob.as_string()[2])


def _parse_single(parser: BaseBlobParser, mimetype: str) -> str:
    """Parse a "Hello World" blob of ``mimetype`` and return its only document's text.

    Asserts that exactly one document is produced.
    """
    blob = Blob(data=b"Hello World", mimetype=mimetype)
    docs = parser.parse(blob)
    assert len(docs) == 1
    return docs[0].page_content


class TestMimeBasedParser:
    """Test mime based parser."""

    def test_without_fallback_parser(self) -> None:
        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": _FirstCharParser(),
                "text/html": _SecondCharParser(),
            },
        )

        # Check text/plain handler.
        assert _parse_single(parser, "text/plain") == "H"

        # Check text/html handler.
        assert _parse_single(parser, "text/html") == "e"

        # Without a fallback parser, an unregistered mimetype must be rejected.
        blob = Blob(data=b"Hello World", mimetype="text/csv")
        with pytest.raises(ValueError, match="Unsupported mime type"):
            parser.parse(blob)

    def test_with_fallback_parser(self) -> None:
        parser = MimeTypeBasedParser(
            handlers={
                "text/plain": _FirstCharParser(),
                "text/html": _SecondCharParser(),
            },
            fallback_parser=_ThirdCharParser(),
        )

        # Check text/plain handler.
        assert _parse_single(parser, "text/plain") == "H"

        # Check text/html handler.
        assert _parse_single(parser, "text/html") == "e"

        # Check that the fallback parser is used when the mimetype is not found.
        assert _parse_single(parser, "text/csv") == "l"