diff --git a/docs/modules/document_loaders/examples/directory_loader.ipynb b/docs/modules/document_loaders/examples/directory_loader.ipynb index 5ffe1a84..40653655 100644 --- a/docs/modules/document_loaders/examples/directory_loader.ipynb +++ b/docs/modules/document_loaders/examples/directory_loader.ipynb @@ -6,7 +6,7 @@ "metadata": {}, "source": [ "# Directory Loader\n", - "This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, this uses the [UnstructuredLoader](./unstructured_file.ipynb)" + "This covers how to use the DirectoryLoader to load all documents in a directory. Under the hood, by default this uses the [UnstructuredLoader](./unstructured_file.ipynb)" ] }, { @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 2, "id": "891fe56f", "metadata": {}, "outputs": [], @@ -39,7 +39,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "id": "addfe9cf", "metadata": {}, "outputs": [], @@ -49,7 +49,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 4, "id": "b042086d", "metadata": {}, "outputs": [ @@ -59,7 +59,67 @@ "1" ] }, - "execution_count": 9, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "c5652850", + "metadata": {}, + "source": [ + "## Change loader class\n", + "By default this uses the UnstructuredLoader class. However, you can change up the type of loader pretty easily." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "81c92da3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TextLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ab38ee36", + "metadata": {}, + "outputs": [], + "source": [ + "loader = DirectoryLoader('../', glob=\"**/*.md\", loader_cls=TextLoader)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "25c8740f", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "38337763", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1" + ] + }, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -71,7 +131,7 @@ { "cell_type": "code", "execution_count": null, - "id": "cbc8256b", + "id": "984c8429", "metadata": {}, "outputs": [], "source": []
diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py
index f90e6787..74f24dd2 100644
--- a/langchain/document_loaders/directory.py
+++ b/langchain/document_loaders/directory.py
@@ -1,19 +1,50 @@
 """Loading logic for loading documents from a directory."""
+import logging
 from pathlib import Path
-from typing import List
+from typing import List, Type, Union
 
 from langchain.docstore.document import Document
 from langchain.document_loaders.base import BaseLoader
+from langchain.document_loaders.text import TextLoader
 from langchain.document_loaders.unstructured import UnstructuredFileLoader
 
+FILE_LOADER_TYPE = Union[Type[UnstructuredFileLoader], Type[TextLoader]]
+logger = logging.getLogger(__name__)
+
+
+def _is_visible(p: Path) -> bool:
+    """Return True if no component of the path starts with a dot."""
+    return not any(part.startswith(".") for part in p.parts)
+
 
 class DirectoryLoader(BaseLoader):
     """Loading logic for loading documents from a directory."""
 
-    def __init__(self, path: str, glob: str = "**/*"):
-        """Initialize with path to directory and how to glob over it."""
+    def __init__(
+        self,
+        path: str,
+        glob: str = "**/[!.]*",
+        silent_errors: bool = False,
+        load_hidden: bool = False,
+        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
+    ):
+        """Initialize with path to directory and how to glob over it.
+
+        Args:
+            path: Path to the directory to load documents from.
+            glob: Glob pattern selecting the files to load. The default
+                does not match file names that start with a dot.
+            silent_errors: If True, a file that fails to load is logged
+                as a warning and skipped instead of raising.
+            load_hidden: If True, also load files with a dot-prefixed
+                component in their path below the directory.
+            loader_cls: Loader class instantiated with each file path.
+        """
         self.path = path
         self.glob = glob
+        self.load_hidden = load_hidden
+        self.loader_cls = loader_cls
+        self.silent_errors = silent_errors
 
     def load(self) -> List[Document]:
         """Load documents."""
@@ -21,6 +52,17 @@ class DirectoryLoader(BaseLoader):
         docs = []
         for i in p.glob(self.glob):
             if i.is_file():
-                sub_docs = UnstructuredFileLoader(str(i)).load()
-                docs.extend(sub_docs)
+                # Check the cheap flag first so the relative path is only built
+                # when hidden files actually have to be filtered out.
+                if self.load_hidden or _is_visible(i.relative_to(p)):
+                    try:
+                        sub_docs = self.loader_cls(str(i)).load()
+                        docs.extend(sub_docs)
+                    except Exception as e:
+                        if self.silent_errors:
+                            # Best-effort mode: name the failing file and move on.
+                            logger.warning("Error loading file %s: %s", i, e)
+                        else:
+                            # Bare raise is the idiomatic way to re-raise here.
+                            raise
         return docs