Harrison/obsidian (#3060)

Co-authored-by: Ben Hofferber <hofferber.ben@gmail.com>
This commit is contained in:
Harrison Chase 2023-04-17 21:57:32 -07:00 committed by GitHub
parent 93c0514105
commit 1920536d99
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 39 additions and 3 deletions

View File

@ -1,6 +1,7 @@
{ {
"cells": [ "cells": [
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"id": "1dc7df1d", "id": "1dc7df1d",
"metadata": {}, "metadata": {},
@ -8,7 +9,9 @@
"# Obsidian\n", "# Obsidian\n",
"This notebook covers how to load documents from an Obsidian database.\n", "This notebook covers how to load documents from an Obsidian database.\n",
"\n", "\n",
"Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory." "Since Obsidian is just stored on disk as a folder of Markdown files, the loader just takes a path to this directory.\n",
"\n",
"Obsidian files sometimes also contain [metadata](https://help.obsidian.md/Editing+and+formatting/Metadata), a YAML block at the top of the file. Each `key: value` line in that block is added to the document's metadata as a string (the block is not parsed as full YAML). Pass `collect_metadata=False` to `ObsidianLoader` to disable this behavior."
] ]
}, },
{ {

View File

@ -1,4 +1,5 @@
"""Loader that loads Obsidian directory dump.""" """Loader that loads Obsidian directory dump."""
import re
from pathlib import Path from pathlib import Path
from typing import List from typing import List
@ -9,10 +10,38 @@ from langchain.document_loaders.base import BaseLoader
class ObsidianLoader(BaseLoader): class ObsidianLoader(BaseLoader):
"""Loader that loads Obsidian files from disk.""" """Loader that loads Obsidian files from disk."""
def __init__(self, path: str, encoding: str = "UTF-8"): FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
def __init__(
self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
):
"""Initialize with path.""" """Initialize with path."""
self.file_path = path self.file_path = path
self.encoding = encoding self.encoding = encoding
self.collect_metadata = collect_metadata
def _parse_front_matter(self, content: str) -> dict:
"""Parse front matter metadata from the content and return it as a dict."""
if not self.collect_metadata:
return {}
match = self.FRONT_MATTER_REGEX.search(content)
front_matter = {}
if match:
lines = match.group(1).split("\n")
for line in lines:
if ":" in line:
key, value = line.split(":", 1)
front_matter[key.strip()] = value.strip()
else:
# Skip lines without a colon
continue
return front_matter
def _remove_front_matter(self, content: str) -> str:
"""Remove front matter metadata from the given content."""
if not self.collect_metadata:
return content
return self.FRONT_MATTER_REGEX.sub("", content)
def load(self) -> List[Document]: def load(self) -> List[Document]:
"""Load documents.""" """Load documents."""
@ -21,6 +50,10 @@ class ObsidianLoader(BaseLoader):
for p in ps: for p in ps:
with open(p, encoding=self.encoding) as f: with open(p, encoding=self.encoding) as f:
text = f.read() text = f.read()
metadata = {"source": str(p)}
front_matter = self._parse_front_matter(text)
text = self._remove_front_matter(text)
metadata = {"source": str(p.name), "path": str(p), **front_matter}
docs.append(Document(page_content=text, metadata=metadata)) docs.append(Document(page_content=text, metadata=metadata))
return docs return docs