Adds progress bar using tqdm to directory_loader (#3349)

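Approach copied from `WebBaseLoader`. `tqdm` is treated as an optional
dependency: it is imported only when `show_progress=True`, so users who do
not have it installed are unaffected by default.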
This commit is contained in:
jrhe 2023-04-25 05:42:42 +01:00 committed by GitHub
parent 344e3508b1
commit 980cc41709
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 72 additions and 1 deletions

View File

@ -68,6 +68,51 @@
"len(docs)" "len(docs)"
] ]
}, },
{
"attachments": {},
"cell_type": "markdown",
"id": "e633d62f",
"metadata": {},
"source": [
"## Show a progress bar"
]
},
{
"attachments": {},
"cell_type": "markdown",
"id": "43911860",
"metadata": {},
"source": [
"By default, no progress bar is shown. To show one, install the `tqdm` library (e.g. `pip install tqdm`) and set the `show_progress` parameter to `True`."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "bb93daac",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: tqdm in /Users/jon/.pyenv/versions/3.9.16/envs/microbiome-app/lib/python3.9/site-packages (4.65.0)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"0it [00:00, ?it/s]\n"
]
}
],
"source": [
"%pip install tqdm\n",
"loader = DirectoryLoader('../', glob=\"**/*.md\", show_progress=True)\n",
"docs = loader.load()"
]
},
{ {
"cell_type": "markdown", "cell_type": "markdown",
"id": "c5652850", "id": "c5652850",

View File

@ -35,6 +35,7 @@ class DirectoryLoader(BaseLoader):
loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader, loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
loader_kwargs: Union[dict, None] = None, loader_kwargs: Union[dict, None] = None,
recursive: bool = False, recursive: bool = False,
show_progress: bool = False,
): ):
"""Initialize with path to directory and how to glob over it.""" """Initialize with path to directory and how to glob over it."""
if loader_kwargs is None: if loader_kwargs is None:
@ -46,12 +47,30 @@ class DirectoryLoader(BaseLoader):
self.loader_kwargs = loader_kwargs self.loader_kwargs = loader_kwargs
self.silent_errors = silent_errors self.silent_errors = silent_errors
self.recursive = recursive self.recursive = recursive
self.show_progress = show_progress
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
p = Path(self.path) p = Path(self.path)
docs = [] docs = []
items = p.rglob(self.glob) if self.recursive else p.glob(self.glob) items = list(p.rglob(self.glob) if self.recursive else p.glob(self.glob))
pbar = None
if self.show_progress:
try:
from tqdm import tqdm
pbar = tqdm(total=len(items))
except ImportError as e:
logger.warning(
"To log the progress of DirectoryLoader you need to install tqdm, "
"`pip install tqdm`"
)
if self.silent_errors:
logger.warning(e)
else:
raise e
for i in items: for i in items:
if i.is_file(): if i.is_file():
if _is_visible(i.relative_to(p)) or self.load_hidden: if _is_visible(i.relative_to(p)) or self.load_hidden:
@ -63,4 +82,11 @@ class DirectoryLoader(BaseLoader):
logger.warning(e) logger.warning(e)
else: else:
raise e raise e
finally:
if pbar:
pbar.update(1)
if pbar:
pbar.close()
return docs return docs