"""Simple reader that reads files of different formats from a directory.""" import logging from pathlib import Path from typing import Callable, Dict, List, Optional, Union from parser.file.base import BaseReader from parser.file.base_parser import BaseParser from parser.file.docs_parser import DocxParser, PDFParser from parser.file.epub_parser import EpubParser from parser.file.html_parser import HTMLParser from parser.file.markdown_parser import MarkdownParser from parser.file.rst_parser import RstParser from parser.file.tabular_parser import PandasCSVParser from parser.schema.base import Document DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = { ".pdf": PDFParser(), ".docx": DocxParser(), ".csv": PandasCSVParser(), ".epub": EpubParser(), ".md": MarkdownParser(), ".rst": RstParser(), ".html": HTMLParser(), } class SimpleDirectoryReader(BaseReader): """Simple directory reader. Can read files into separate documents, or concatenates files into one document text. Args: input_dir (str): Path to the directory. input_files (List): List of file paths to read (Optional; overrides input_dir) exclude_hidden (bool): Whether to exclude hidden files (dotfiles). errors (str): how encoding and decoding errors are to be handled, see https://docs.python.org/3/library/functions.html#open recursive (bool): Whether to recursively search in subdirectories. False by default. required_exts (Optional[List[str]]): List of required extensions. Default is None. file_extractor (Optional[Dict[str, BaseParser]]): A mapping of file extension to a BaseParser class that specifies how to convert that file to text. See DEFAULT_FILE_EXTRACTOR. num_files_limit (Optional[int]): Maximum number of files to read. Default is None. file_metadata (Optional[Callable[str, Dict]]): A function that takes in a filename and returns a Dict of metadata for the Document. Default is None. """ def __init__( self, input_dir: Optional[str] = None, input_files: Optional[List] = None, exclude_hidden: bool = True, errors: str = "ignore", recursive: bool = True, required_exts: Optional[List[str]] = None, file_extractor: Optional[Dict[str, BaseParser]] = None, num_files_limit: Optional[int] = None, file_metadata: Optional[Callable[[str], Dict]] = None, ) -> None: """Initialize with parameters.""" super().__init__() if not input_dir and not input_files: raise ValueError("Must provide either `input_dir` or `input_files`.") self.errors = errors self.recursive = recursive self.exclude_hidden = exclude_hidden self.required_exts = required_exts self.num_files_limit = num_files_limit if input_files: self.input_files = [] for path in input_files: input_file = Path(path) self.input_files.append(input_file) elif input_dir: self.input_dir = Path(input_dir) self.input_files = self._add_files(self.input_dir) self.file_extractor = file_extractor or DEFAULT_FILE_EXTRACTOR self.file_metadata = file_metadata def _add_files(self, input_dir: Path) -> List[Path]: """Add files.""" input_files = sorted(input_dir.iterdir()) new_input_files = [] dirs_to_explore = [] for input_file in input_files: if input_file.is_dir(): if self.recursive: dirs_to_explore.append(input_file) elif self.exclude_hidden and input_file.name.startswith("."): continue elif ( self.required_exts is not None and input_file.suffix not in self.required_exts ): continue else: new_input_files.append(input_file) for dir_to_explore in dirs_to_explore: sub_input_files = self._add_files(dir_to_explore) new_input_files.extend(sub_input_files) if self.num_files_limit is not None and self.num_files_limit > 0: new_input_files = new_input_files[0 : self.num_files_limit] # print total number of files added logging.debug( f"> [SimpleDirectoryReader] Total files added: {len(new_input_files)}" ) return new_input_files def load_data(self, concatenate: bool = False) -> List[Document]: """Load data from the input directory. Args: concatenate (bool): whether to concatenate all files into one document. If set to True, file metadata is ignored. False by default. Returns: List[Document]: A list of documents. """ data: Union[str, List[str]] = "" data_list: List[str] = [] metadata_list = [] for input_file in self.input_files: if input_file.suffix in self.file_extractor: parser = self.file_extractor[input_file.suffix] if not parser.parser_config_set: parser.init_parser() data = parser.parse_file(input_file, errors=self.errors) else: # do standard read with open(input_file, "r", errors=self.errors) as f: data = f.read() if isinstance(data, List): data_list.extend(data) else: data_list.append(str(data)) if self.file_metadata is not None: metadata_list.append(self.file_metadata(str(input_file))) if concatenate: return [Document("\n".join(data_list))] elif self.file_metadata is not None: return [Document(d, extra_info=m) for d, m in zip(data_list, metadata_list)] else: return [Document(d) for d in data_list]