import concurrent.futures
import logging
import random
from pathlib import Path
from typing import Any, List, Optional, Type, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.html_bs import BSHTMLLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]
logger = logging.getLogger(__name__)


def _is_visible(p: Path) -> bool:
    """Return True if no component of the path is hidden (dot-prefixed)."""
    parts = p.parts
    for _p in parts:
        if _p.startswith("."):
            return False
    return True


class DirectoryLoader(BaseLoader):
    """Load from a directory.

    A usage sketch is provided at the end of this module.
    """

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*",
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        loader_kwargs: Union[dict, None] = None,
        recursive: bool = False,
        show_progress: bool = False,
        use_multithreading: bool = False,
        max_concurrency: int = 4,
        *,
        sample_size: int = 0,
        randomize_sample: bool = False,
        sample_seed: Union[int, None] = None,
    ):
        """Initialize with a path to directory and how to glob over it.

        Args:
            path: Path to directory.
            glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
                (all files except hidden).
            silent_errors: Whether to silently ignore errors. Defaults to False.
            load_hidden: Whether to load hidden files. Defaults to False.
            loader_cls: Loader class to use for loading files.
                Defaults to UnstructuredFileLoader.
            loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None.
            recursive: Whether to recursively search for files. Defaults to False.
            show_progress: Whether to show a progress bar. Defaults to False.
            use_multithreading: Whether to use multithreading. Defaults to False.
            max_concurrency: The maximum number of threads to use. Defaults to 4.
            sample_size: The maximum number of files you would like to load from the
                directory.
            randomize_sample: Shuffle the files to get a random sample.
            sample_seed: Set the seed of the random shuffle for reproducibility.
        """
        if loader_kwargs is None:
            loader_kwargs = {}
        self.path = path
        self.glob = glob
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.loader_kwargs = loader_kwargs
        self.silent_errors = silent_errors
        self.recursive = recursive
        self.show_progress = show_progress
        self.use_multithreading = use_multithreading
        self.max_concurrency = max_concurrency
        self.sample_size = sample_size
        self.randomize_sample = randomize_sample
        self.sample_seed = sample_seed

    def load_file(
        self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
    ) -> None:
        """Load a file.

        Args:
            item: File path.
            path: Directory path.
            docs: List of documents to append to.
            pbar: Progress bar. Defaults to None.
        """
        if item.is_file():
            if _is_visible(item.relative_to(path)) or self.load_hidden:
                try:
                    logger.debug(f"Processing file: {str(item)}")
                    sub_docs = self.loader_cls(str(item), **self.loader_kwargs).load()
                    docs.extend(sub_docs)
                except Exception as e:
                    if self.silent_errors:
                        logger.warning(f"Error loading file {str(item)}: {e}")
                    else:
                        # Log which file failed before re-raising so the error
                        # can be traced back to a specific path.
                        logger.error(f"Error loading file {str(item)}")
                        raise e
                finally:
                    if pbar:
                        pbar.update(1)

    def load(self) -> List[Document]:
        """Load documents."""
        p = Path(self.path)
        if not p.exists():
            raise FileNotFoundError(f"Directory not found: '{self.path}'")
        if not p.is_dir():
            raise ValueError(f"Expected directory, got file: '{self.path}'")

        docs: List[Document] = []
        items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))

        if self.sample_size > 0:
            if self.randomize_sample:
                # Use a dedicated generator when a seed is given (including 0)
                # so the shuffle is reproducible.
                randomizer = (
                    random.Random(self.sample_seed)
                    if self.sample_seed is not None
                    else random
                )
                randomizer.shuffle(items)  # type: ignore
            items = items[: min(len(items), self.sample_size)]

        pbar = None
        if self.show_progress:
            try:
                from tqdm import tqdm

                pbar = tqdm(total=len(items))
            except ImportError as e:
                logger.warning(
                    "To log the progress of DirectoryLoader you need to install tqdm, "
                    "`pip install tqdm`"
                )
                if self.silent_errors:
                    logger.warning(e)
                else:
                    raise ImportError(
                        "To log the progress of DirectoryLoader "
                        "you need to install tqdm, "
                        "`pip install tqdm`"
                    )

        if self.use_multithreading:
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.max_concurrency
            ) as executor:
                # map() submits every task eagerly; exiting the context manager
                # waits for all workers to finish before `docs` is returned.
                executor.map(lambda i: self.load_file(i, p, docs, pbar), items)
        else:
            for i in items:
                self.load_file(i, p, docs, pbar)

        if pbar:
            pbar.close()

        return docs
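

# Usage sketch (illustrative only, not part of the library API): it assumes an
# "example_data/" directory of ".txt" files exists on disk; the path, glob,
# and sampling values below are placeholders.
if __name__ == "__main__":
    demo_loader = DirectoryLoader(
        "example_data/",
        glob="**/*.txt",
        loader_cls=TextLoader,
        use_multithreading=True,
        sample_size=10,
        randomize_sample=True,
        sample_seed=42,
    )
    demo_docs = demo_loader.load()
    print(f"Loaded {len(demo_docs)} documents")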