From 980cc4170900f9bfd0b60648fc1bf477ba52f6b3 Mon Sep 17 00:00:00 2001 From: jrhe <4038905+jrhe@users.noreply.github.com> Date: Tue, 25 Apr 2023 05:42:42 +0100 Subject: [PATCH] Adds progress bar using tqdm to directory_loader (#3349) Approach copied from `WebBaseLoader`. Assumes the user doesn't have `tqdm` installed. --- .../examples/directory_loader.ipynb | 45 +++++++++++++++++++ langchain/document_loaders/directory.py | 28 +++++++++++- 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb index 7a9b4e6f..8bec57a3 100644 --- a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb +++ b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb @@ -68,6 +68,51 @@ "len(docs)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "e633d62f", + "metadata": {}, + "source": [ + "## Show a progress bar" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "43911860", + "metadata": {}, + "source": [ + "By default a progress bar will not be shown. To show a progress bar, install the `tqdm` library (e.g. `pip install tqdm`), and set the `show_progress` parameter to `True`." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bb93daac", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: tqdm in /Users/jon/.pyenv/versions/3.9.16/envs/microbiome-app/lib/python3.9/site-packages (4.65.0)\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "0it [00:00, ?it/s]\n" + ] + } + ], + "source": [ + "%pip install tqdm\n", + "loader = DirectoryLoader('../', glob=\"**/*.md\", show_progress=True)\n", + "docs = loader.load()" + ] + }, { "cell_type": "markdown", "id": "c5652850", diff --git a/langchain/document_loaders/directory.py b/langchain/document_loaders/directory.py index ec121d60..c180a3cd 100644 --- a/langchain/document_loaders/directory.py +++ b/langchain/document_loaders/directory.py @@ -35,6 +35,7 @@ class DirectoryLoader(BaseLoader): loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader, loader_kwargs: Union[dict, None] = None, recursive: bool = False, + show_progress: bool = False, ): """Initialize with path to directory and how to glob over it.""" if loader_kwargs is None: @@ -46,12 +47,30 @@ class DirectoryLoader(BaseLoader): self.loader_kwargs = loader_kwargs self.silent_errors = silent_errors self.recursive = recursive + self.show_progress = show_progress def load(self) -> List[Document]: """Load documents.""" p = Path(self.path) docs = [] - items = p.rglob(self.glob) if self.recursive else p.glob(self.glob) + items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob)) + + pbar = None + if self.show_progress: + try: + from tqdm import tqdm + + pbar = tqdm(total=len(items)) + except ImportError as e: + logger.warning( + "To log the progress of DirectoryLoader you need to install tqdm, " + "`pip install tqdm`" + ) + if self.silent_errors: + logger.warning(e) + else: + raise e + for i in items: if i.is_file(): if _is_visible(i.relative_to(p)) or self.load_hidden: @@ -63,4 +82,11 @@ class DirectoryLoader(BaseLoader): logger.warning(e) else: raise e + finally: + if pbar: + pbar.update(1) + + if pbar: + pbar.close() + return docs