diff --git a/langchain/document_loaders/unstructured.py b/langchain/document_loaders/unstructured.py index 079af736..97137e08 100644 --- a/langchain/document_loaders/unstructured.py +++ b/langchain/document_loaders/unstructured.py @@ -33,12 +33,19 @@ class UnstructuredFileLoader(BaseLoader): def load(self) -> List[Document]: """Load file.""" elements = self._get_elements() - metadata = {"source": self.file_path} if self.mode == "elements": - docs = [ - Document(page_content=str(el), metadata=metadata) for el in elements - ] + docs: List[Document] = list() + for element in elements: + metadata = {"source": self.file_path} + # NOTE(MthwRobinson) - the attribute check is for backward compatibility + # with unstructured<0.4.9. The metadata attribute was added in 0.4.9. + if hasattr(element, "metadata"): + metadata.update(element.metadata.to_dict()) + if hasattr(element, "category"): + metadata["category"] = element.category + docs.append(Document(page_content=str(element), metadata=metadata)) elif self.mode == "single": + metadata = {"source": self.file_path} text = "\n\n".join([str(el) for el in elements]) docs = [Document(page_content=text, metadata=metadata)] else: