diff --git a/docs/modules/indexes/document_loaders/examples/obsidian.ipynb b/docs/modules/indexes/document_loaders/examples/obsidian.ipynb
index e92b9c2b96..c412f946a7 100644
--- a/docs/modules/indexes/document_loaders/examples/obsidian.ipynb
+++ b/docs/modules/indexes/document_loaders/examples/obsidian.ipynb
@@ -1,6 +1,7 @@
 {
  "cells": [
   {
+   "attachments": {},
    "cell_type": "markdown",
    "id": "1dc7df1d",
    "metadata": {},
@@ -8,7 +9,9 @@
     "# Obsidian\n",
     "This notebook covers how to load documents from an Obsidian database.\n",
     "\n",
-    "Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory."
+    "Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory.\n",
+    "\n",
+    "Obsidian files also sometimes contain [metadata](https://help.obsidian.md/Editing+and+formatting/Metadata) which is a YAML block at the top of the file. These values will be added to the document's metadata. (`ObsidianLoader` can also be passed a `collect_metadata=False` argument to disable this behavior.)"
    ]
   },
   {
diff --git a/langchain/document_loaders/obsidian.py b/langchain/document_loaders/obsidian.py
index df5a5d7e44..982b60bd45 100644
--- a/langchain/document_loaders/obsidian.py
+++ b/langchain/document_loaders/obsidian.py
@@ -1,4 +1,5 @@
 """Loader that loads Obsidian directory dump."""
+import re
 from pathlib import Path
 from typing import List
 
@@ -9,10 +10,40 @@ from langchain.document_loaders.base import BaseLoader
 class ObsidianLoader(BaseLoader):
     """Loader that loads Obsidian files from disk."""
 
-    def __init__(self, path: str, encoding: str = "UTF-8"):
+    # No re.MULTILINE: "^" must anchor to the very start of the file so that a
+    # "---" horizontal rule later in a note is never mistaken for front matter.
+    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)
+
+    def __init__(
+        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
+    ):
         """Initialize with path."""
         self.file_path = path
         self.encoding = encoding
+        self.collect_metadata = collect_metadata
+
+    def _parse_front_matter(self, content: str) -> dict:
+        """Parse front matter metadata from the content and return it as a dict."""
+        if not self.collect_metadata:
+            return {}
+        match = self.FRONT_MATTER_REGEX.search(content)
+        front_matter = {}
+        if match:
+            lines = match.group(1).split("\n")
+            for line in lines:
+                if ":" in line:
+                    key, value = line.split(":", 1)
+                    front_matter[key.strip()] = value.strip()
+                else:
+                    # Skip lines without a colon
+                    continue
+        return front_matter
+
+    def _remove_front_matter(self, content: str) -> str:
+        """Remove front matter metadata from the given content."""
+        if not self.collect_metadata:
+            return content
+        return self.FRONT_MATTER_REGEX.sub("", content, count=1)
 
     def load(self) -> List[Document]:
         """Load documents."""
@@ -21,6 +52,10 @@ class ObsidianLoader(BaseLoader):
         for p in ps:
             with open(p, encoding=self.encoding) as f:
                 text = f.read()
-            metadata = {"source": str(p)}
+
+            front_matter = self._parse_front_matter(text)
+            text = self._remove_front_matter(text)
+            metadata = {"source": str(p.name), "path": str(p), **front_matter}
             docs.append(Document(page_content=text, metadata=metadata))
+
         return docs