from typing import Iterator, List, Optional

from langchain_community.docstore.document import Document
from langchain_community.document_loaders.base import BaseLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileIOLoader


class AzureAIDataLoader(BaseLoader):
    """Load documents from Azure AI Data.

    Files are listed through ``AzureMachineLearningFileSystem`` (from the
    optional ``azureml-fsspec`` package) and each one is parsed with
    ``UnstructuredFileIOLoader``.
    """

    def __init__(self, url: str, glob: Optional[str] = None):
        """Initialize with URL to a data asset or storage location.

        Args:
            url: URL to the data asset or storage location.
            glob: Optional glob pattern to select files. Defaults to None,
                in which case the file system's ``ls()`` listing is used.
        """
        self.url = url
        """URL to the data asset or storage location."""
        self.glob_pattern = glob
        """Optional glob pattern to select files. Defaults to None."""

    def load(self) -> List[Document]:
        """Load documents.

        Returns:
            Every document yielded by :meth:`lazy_load`, collected in a list.
        """
        return list(self.lazy_load())

    def lazy_load(self) -> Iterator[Document]:
        """Lazily load documents, one remote path at a time.

        Yields:
            The documents ``UnstructuredFileIOLoader`` produces for each
            remote path returned by the file system listing.

        Raises:
            ImportError: If the ``azureml-fsspec`` package is not installed.
        """
        # Imported here so the dependency is only required when the loader
        # is actually used.
        try:
            from azureml.fsspec import AzureMachineLearningFileSystem
        except ImportError as exc:
            # Note the trailing space: adjacent literals are concatenated
            # verbatim, so without it the two sentences run together.
            raise ImportError(
                "Could not import azureml-fsspec package. "
                "Please install it with `pip install azureml-fsspec`."
            ) from exc

        fs = AzureMachineLearningFileSystem(self.url)

        # An unset (or empty) pattern falls back to the plain listing.
        if self.glob_pattern:
            remote_paths_list = fs.glob(self.glob_pattern)
        else:
            remote_paths_list = fs.ls()

        for remote_path in remote_paths_list:
            # Keep the handle open only while its contents are being parsed.
            with fs.open(remote_path) as f:
                loader = UnstructuredFileIOLoader(file=f)
                yield from loader.load()