import concurrent.futures
import logging
import random
from pathlib import Path
from typing import Any, List, Optional, Sequence, Type, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.html_bs import BSHTMLLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

FILE_LOADER_TYPE = Union[
    Type[UnstructuredFileLoader], Type[TextLoader], Type[BSHTMLLoader]
]
logger = logging.getLogger(__name__)


def _is_visible(p: Path) -> bool:
    """Return True if no component of the path is hidden (dot-prefixed)."""
    parts = p.parts
    for _p in parts:
        if _p.startswith("."):
            return False
    return True


class DirectoryLoader(BaseLoader):
    """Load from a directory."""

    def __init__(
        self,
        path: str,
        glob: str = "**/[!.]*",
        silent_errors: bool = False,
        load_hidden: bool = False,
        loader_cls: FILE_LOADER_TYPE = UnstructuredFileLoader,
        loader_kwargs: Union[dict, None] = None,
        recursive: bool = False,
        show_progress: bool = False,
        use_multithreading: bool = False,
        max_concurrency: int = 4,
        *,
        exclude: Union[Sequence[str], str] = (),
        sample_size: int = 0,
        randomize_sample: bool = False,
        sample_seed: Union[int, None] = None,
    ):
        """Initialize with a path to a directory and how to glob over it.

        Args:
            path: Path to directory.
            glob: Glob pattern to use to find files. Defaults to "**/[!.]*"
                (all files except hidden).
            exclude: A pattern or list of patterns to exclude from results.
                Use glob syntax.
            silent_errors: Whether to silently ignore errors. Defaults to False.
            load_hidden: Whether to load hidden files. Defaults to False.
            loader_cls: Loader class to use for loading files.
                Defaults to UnstructuredFileLoader.
            loader_kwargs: Keyword arguments to pass to loader_cls. Defaults to None.
            recursive: Whether to recursively search for files. Defaults to False.
            show_progress: Whether to show a progress bar. Defaults to False.
            use_multithreading: Whether to use multithreading. Defaults to False.
            max_concurrency: The maximum number of threads to use. Defaults to 4.
            sample_size: The maximum number of files to load from the directory.
            randomize_sample: Shuffle the files to get a random sample.
            sample_seed: Set the seed of the random shuffle for reproducibility.

        Examples:

            .. code-block:: python

                from langchain_community.document_loaders import DirectoryLoader

                # Load all non-hidden files in a directory.
                loader = DirectoryLoader("/path/to/directory")

                # Load all text files in a directory without recursion.
                loader = DirectoryLoader("/path/to/directory", glob="*.txt")

                # Recursively load all text files in a directory.
                loader = DirectoryLoader(
                    "/path/to/directory", glob="*.txt", recursive=True
                )
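
                # Illustrative sketch: use a specific loader class and pass it
                # keyword arguments. TextLoader and its autodetect_encoding flag
                # come from langchain_community; the path is a placeholder.
                from langchain_community.document_loaders import TextLoader

                loader = DirectoryLoader(
                    "/path/to/directory",
                    glob="**/*.txt",
                    loader_cls=TextLoader,
                    loader_kwargs={"autodetect_encoding": True},
                )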

                # Load all files in a directory, except for py files.
                loader = DirectoryLoader("/path/to/directory", exclude="*.py")

                # Load all files in a directory, except for py or pyc files.
                loader = DirectoryLoader(
                    "/path/to/directory", exclude=["*.py", "*.pyc"]
                )
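
                # Illustrative sketch: load a reproducible random sample of files
                # in parallel with a progress bar. All keyword arguments shown are
                # parameters of this class; show_progress requires tqdm.
                loader = DirectoryLoader(
                    "/path/to/directory",
                    sample_size=10,
                    randomize_sample=True,
                    sample_seed=42,
                    use_multithreading=True,
                    max_concurrency=8,
                    show_progress=True,
                )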

        """
        if loader_kwargs is None:
            loader_kwargs = {}
        if isinstance(exclude, str):
            exclude = (exclude,)
        self.path = path
        self.glob = glob
        self.exclude = exclude
        self.load_hidden = load_hidden
        self.loader_cls = loader_cls
        self.loader_kwargs = loader_kwargs
        self.silent_errors = silent_errors
        self.recursive = recursive
        self.show_progress = show_progress
        self.use_multithreading = use_multithreading
        self.max_concurrency = max_concurrency
        self.sample_size = sample_size
        self.randomize_sample = randomize_sample
        self.sample_seed = sample_seed

    def load_file(
        self, item: Path, path: Path, docs: List[Document], pbar: Optional[Any]
    ) -> None:
        """Load a file.

        Args:
            item: File path.
            path: Directory path.
            docs: List of documents to append to.
            pbar: Progress bar. Defaults to None.

        """
        if item.is_file():
            if _is_visible(item.relative_to(path)) or self.load_hidden:
                try:
                    logger.debug(f"Processing file: {str(item)}")
                    sub_docs = self.loader_cls(str(item), **self.loader_kwargs).load()
                    docs.extend(sub_docs)
                except Exception as e:
                    if self.silent_errors:
                        logger.warning(f"Error loading file {str(item)}: {e}")
                    else:
                        # Log which file failed before re-raising, so callers can
                        # identify the offending document.
                        logger.error(f"Error loading file {str(item)}")
                        raise e
                finally:
                    if pbar:
                        pbar.update(1)

    def load(self) -> List[Document]:
        """Load documents."""
        p = Path(self.path)
        if not p.exists():
            raise FileNotFoundError(f"Directory not found: '{self.path}'")
        if not p.is_dir():
            raise ValueError(f"Expected directory, got file: '{self.path}'")

        docs: List[Document] = []

        # Collect candidate paths matching the glob, dropping any excluded patterns.
        paths = p.rglob(self.glob) if self.recursive else p.glob(self.glob)
        items = [
            path
            for path in paths
            if not (self.exclude and any(path.match(glob) for glob in self.exclude))
        ]

        # Optionally restrict the candidates to a (possibly shuffled) sample.
        if self.sample_size > 0:
            if self.randomize_sample:
                randomizer = random.Random(
                    self.sample_seed if self.sample_seed else None
                )
                randomizer.shuffle(items)
            items = items[: min(len(items), self.sample_size)]

        pbar = None
        if self.show_progress:
            try:
                from tqdm import tqdm

                pbar = tqdm(total=len(items))
            except ImportError as e:
                logger.warning(
                    "To log the progress of DirectoryLoader you need to install tqdm, "
                    "`pip install tqdm`"
                )
                if self.silent_errors:
                    logger.warning(e)
                else:
                    raise ImportError(
                        "To log the progress of DirectoryLoader "
                        "you need to install tqdm, "
                        "`pip install tqdm`"
                    )

        if self.use_multithreading:
            # Exiting the executor context waits for all worker threads to finish.
            with concurrent.futures.ThreadPoolExecutor(
                max_workers=self.max_concurrency
            ) as executor:
                executor.map(lambda i: self.load_file(i, p, docs, pbar), items)
        else:
            for i in items:
                self.load_file(i, p, docs, pbar)

        if pbar:
            pbar.close()

        return docs