From 8e41143bf552cc697dbab6a8bf3f96ea01cce387 Mon Sep 17 00:00:00 2001 From: Eugene Yurtsev Date: Wed, 17 May 2023 22:38:55 -0400 Subject: [PATCH] Add a generic document loader (#4875) # Add generic document loader * This PR adds a generic document loader which can assemble a loader from a blob loader and a parser * Adds a registry for parsers * Populate registry with a default mimetype based parser ## Expected changes - Parsing involves loading content via IO so can be sped up via: * Threading in sync * Async - The actual parsing logic may be computationally involved: may need to figure out how to add multi-processing support - May want to add suffix based parser since suffixes are easier to specify in comparison to mime types ## Before submitting No notebooks yet, we first need to get a few of the basic parsers up (prior to advertising the interface) --- langchain/document_loaders/generic.py | 133 ++++++++++++++++++ langchain/document_loaders/parsers/generic.py | 1 + .../document_loaders/parsers/registry.py | 30 ++++ langchain/document_loaders/parsers/txt.py | 12 ++ .../document_loaders/test_generic_loader.py | 114 +++++++++++++++ 5 files changed, 290 insertions(+) create mode 100644 langchain/document_loaders/generic.py create mode 100644 langchain/document_loaders/parsers/registry.py create mode 100644 langchain/document_loaders/parsers/txt.py create mode 100644 tests/unit_tests/document_loaders/test_generic_loader.py diff --git a/langchain/document_loaders/generic.py b/langchain/document_loaders/generic.py new file mode 100644 index 00000000..0625507b --- /dev/null +++ b/langchain/document_loaders/generic.py @@ -0,0 +1,133 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Iterator, List, Literal, Optional, Sequence, Union + +from langchain.document_loaders.base import BaseBlobParser, BaseLoader +from langchain.document_loaders.blob_loaders import BlobLoader, FileSystemBlobLoader +from langchain.document_loaders.parsers.registry 
import get_parser +from langchain.schema import Document +from langchain.text_splitter import TextSplitter + +_PathLike = Union[str, Path] + +DEFAULT = Literal["default"] + + +class GenericLoader(BaseLoader): + """A generic document loader. + + A generic document loader that allows combining an arbitrary blob loader with + a blob parser. + + Examples: + + .. code-block:: python + + from langchain.document_loaders import GenericLoader + from langchain.document_loaders.blob_loaders import FileSystemBlobLoader + + loader = GenericLoader.from_filesystem( + path="path/to/directory", + glob="**/[!.]*", + suffixes=[".pdf"], + show_progress=True, + ) + + docs = loader.lazy_load() + next(docs) + + Example instantiations to change which files are loaded: + + .. code-block:: python + + # Recursively load all text files in a directory. + loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/*.txt") + + # Recursively load all non-hidden files in a directory. + loader = GenericLoader.from_filesystem("/path/to/dir", glob="**/[!.]*") + + # Load all files in a directory without recursion. + loader = GenericLoader.from_filesystem("/path/to/dir", glob="*") + + Example instantiations to change which parser is used: + + .. code-block:: python + + from langchain.document_loaders.parsers.pdf import PyPDFParser + + # Recursively load all PDF files in a directory. + loader = GenericLoader.from_filesystem( + "/path/to/dir", + glob="**/*.pdf", + parser=PyPDFParser() + ) + """ + + def __init__( + self, + blob_loader: BlobLoader, + blob_parser: BaseBlobParser, + ) -> None: + """A generic document loader. + + Args: + blob_loader: A blob loader which knows how to yield blobs + blob_parser: A blob parser which knows how to parse blobs into documents + """ + self.blob_loader = blob_loader + self.blob_parser = blob_parser + + def lazy_load( + self, + ) -> Iterator[Document]: + """Load documents lazily. 
Use this when working at a large scale.""" + for blob in self.blob_loader.yield_blobs(): + yield from self.blob_parser.lazy_parse(blob) + + def load(self) -> List[Document]: + """Load all documents.""" + return list(self.lazy_load()) + + def load_and_split( + self, text_splitter: Optional[TextSplitter] = None + ) -> List[Document]: + """Load all documents and split them into sentences.""" + raise NotImplementedError( + "Loading and splitting is not yet implemented for generic loaders. " + "When they will be implemented they will be added via the initializer. " + "This method should not be used going forward." + ) + + @classmethod + def from_filesystem( + cls, + path: _PathLike, + *, + glob: str = "**/[!.]*", + suffixes: Optional[Sequence[str]] = None, + show_progress: bool = False, + parser: Union[DEFAULT, BaseBlobParser] = "default", + ) -> GenericLoader: + """Create a generic document loader using a filesystem blob loader. + + Args: + path: The path to the directory to load documents from. + glob: The glob pattern to use to find documents. + suffixes: The suffixes to use to filter documents. If None, all files + matching the glob will be loaded. + show_progress: Whether to show a progress bar or not (requires tqdm). + Proxies to the file system loader. + parser: A blob parser which knows how to parse blobs into documents + + Returns: + A generic document loader. 
+ """ + blob_loader = FileSystemBlobLoader( + path, glob=glob, suffixes=suffixes, show_progress=show_progress + ) + if isinstance(parser, str): + blob_parser = get_parser(parser) + else: + blob_parser = parser + return cls(blob_loader, blob_parser) diff --git a/langchain/document_loaders/parsers/generic.py b/langchain/document_loaders/parsers/generic.py index f2458f7d..80545281 100644 --- a/langchain/document_loaders/parsers/generic.py +++ b/langchain/document_loaders/parsers/generic.py @@ -34,6 +34,7 @@ class MimeTypeBasedParser(BaseBlobParser): def __init__( self, handlers: Mapping[str, BaseBlobParser], + *, fallback_parser: Optional[BaseBlobParser] = None, ) -> None: """Define a parser that uses mime-types to determine how to parse a blob. diff --git a/langchain/document_loaders/parsers/registry.py b/langchain/document_loaders/parsers/registry.py new file mode 100644 index 00000000..9c9aad83 --- /dev/null +++ b/langchain/document_loaders/parsers/registry.py @@ -0,0 +1,30 @@ +"""Module includes a registry of default parser configurations.""" +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.parsers.generic import MimeTypeBasedParser +from langchain.document_loaders.parsers.pdf import PyMuPDFParser +from langchain.document_loaders.parsers.txt import TextParser + + +def _get_default_parser() -> BaseBlobParser: + """Get default mime-type based parser.""" + return MimeTypeBasedParser( + handlers={ + "application/pdf": PyMuPDFParser(), + "text/plain": TextParser(), + }, + fallback_parser=None, + ) + + +_REGISTRY = { + "default": _get_default_parser, +} + +# PUBLIC API + + +def get_parser(parser_name: str) -> BaseBlobParser: + """Get a parser by parser name.""" + if parser_name not in _REGISTRY: + raise ValueError(f"Unknown parser combination: {parser_name}") + return _REGISTRY[parser_name]() diff --git a/langchain/document_loaders/parsers/txt.py b/langchain/document_loaders/parsers/txt.py new file mode 100644 index 
00000000..58bed568 --- /dev/null +++ b/langchain/document_loaders/parsers/txt.py @@ -0,0 +1,12 @@ +"""Module for parsing text files.""" +from typing import Iterator + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob +from langchain.schema import Document + + +class TextParser(BaseBlobParser): + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Lazily parse the blob.""" + yield Document(page_content=blob.as_string(), metadata={"source": blob.source}) diff --git a/tests/unit_tests/document_loaders/test_generic_loader.py b/tests/unit_tests/document_loaders/test_generic_loader.py new file mode 100644 index 00000000..9d6a2166 --- /dev/null +++ b/tests/unit_tests/document_loaders/test_generic_loader.py @@ -0,0 +1,114 @@ +"""Test generic loader.""" +import os +import tempfile +from pathlib import Path +from typing import Generator, Iterator + +import pytest + +from langchain.document_loaders.base import BaseBlobParser +from langchain.document_loaders.blob_loaders import Blob, FileSystemBlobLoader +from langchain.document_loaders.generic import GenericLoader +from langchain.schema import Document + + +@pytest.fixture +def toy_dir() -> Generator[Path, None, None]: + """Yield a pre-populated directory to test the blob loader.""" + with tempfile.TemporaryDirectory() as temp_dir: + # Create test.txt + with open(os.path.join(temp_dir, "test.txt"), "w") as test_txt: + test_txt.write("This is a test.txt file.") + + # Create test.html + with open(os.path.join(temp_dir, "test.html"), "w") as test_html: + test_html.write( + "

This is a test.html file.

" + ) + + # Create .hidden_file + with open(os.path.join(temp_dir, ".hidden_file"), "w") as hidden_file: + hidden_file.write("This is a hidden file.") + + # Create some_dir/nested_file.txt + some_dir = os.path.join(temp_dir, "some_dir") + os.makedirs(some_dir) + with open(os.path.join(some_dir, "nested_file.txt"), "w") as nested_file: + nested_file.write("This is a nested_file.txt file.") + + # Create some_dir/other_dir/more_nested.txt + other_dir = os.path.join(some_dir, "other_dir") + os.makedirs(other_dir) + with open(os.path.join(other_dir, "more_nested.txt"), "w") as nested_file: + nested_file.write("This is a more_nested.txt file.") + + yield Path(temp_dir) + + +class AsIsParser(BaseBlobParser): + """Parser created for testing purposes.""" + + def lazy_parse(self, blob: Blob) -> Iterator[Document]: + """Yield the entire blob content as a single document.""" + yield Document(page_content=blob.as_string()) + + +def test__init__(toy_dir: str) -> None: + """Test initialization from init.""" + loader = GenericLoader( + FileSystemBlobLoader(toy_dir, suffixes=[".txt"]), + AsIsParser(), + ) + docs = loader.load() + assert len(docs) == 3 + # Glob order seems to be deterministic with recursion. If this test becomes flaky, + # we can sort the docs by page content. + assert docs[0].page_content == "This is a test.txt file." + + +def test_from_filesystem_classmethod(toy_dir: str) -> None: + """Test generic loader.""" + loader = GenericLoader.from_filesystem( + toy_dir, suffixes=[".txt"], parser=AsIsParser() + ) + docs = loader.load() + assert len(docs) == 3 + # Glob order seems to be deterministic with recursion. If this test becomes flaky, + # we can sort the docs by page content. + assert docs[0].page_content == "This is a test.txt file." 
+ + +def test_from_filesystem_classmethod_with_glob(toy_dir: str) -> None: + """Test that glob parameter is taken into account.""" + loader = GenericLoader.from_filesystem(toy_dir, glob="*.txt", parser=AsIsParser()) + docs = loader.load() + assert len(docs) == 1 + # Glob order seems to be deterministic with recursion. If this test becomes flaky, + # we can sort the docs by page content. + assert docs[0].page_content == "This is a test.txt file." + + +@pytest.mark.requires("tqdm") +def test_from_filesystem_classmethod_show_progress(toy_dir: str) -> None: + """Test that loading works when show_progress is enabled.""" + loader = GenericLoader.from_filesystem( + toy_dir, glob="*.txt", parser=AsIsParser(), show_progress=True + ) + docs = loader.load() + assert len(docs) == 1 + # Glob order seems to be deterministic with recursion. If this test becomes flaky, + # we can sort the docs by page content. + assert docs[0].page_content == "This is a test.txt file." + + +def test_from_filesystem_using_default_parser(toy_dir: str) -> None: + """Use the default generic parser.""" + loader = GenericLoader.from_filesystem( + toy_dir, + suffixes=[".txt"], + ) + docs = loader.load() + assert len(docs) == 3 + # Glob order seems to be deterministic with recursion. If this test becomes flaky, + # we can sort the docs by page content. + assert docs[0].page_content == "This is a test.txt file."