import re
from pathlib import Path
from typing import Iterator, List

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class AcreomLoader(BaseLoader):
    """Load `acreom` vault from a directory."""

    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
    """Regex to match front matter metadata in markdown files."""

    def __init__(
        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
    ):
        """Initialize the loader.

        Args:
            path: Directory that is searched recursively for ``*.md`` files.
            encoding: Text encoding used when reading each file.
            collect_metadata: When true, the leading front matter block is
                parsed into document metadata and stripped from the text.
                When false, no metadata is parsed and the front matter is
                left in the text untouched.
        """
        self.file_path = path
        """Path to the directory containing the markdown files."""
        self.encoding = encoding
        """Encoding to use when reading the files."""
        self.collect_metadata = collect_metadata
        """Whether to collect metadata from the front matter."""

    def _match_front_matter(self, content: str):
        """Return the regex match for the leading front matter block, or None.

        ``FRONT_MATTER_REGEX`` is compiled with ``re.MULTILINE``, so an
        unanchored ``search``/``sub`` would also hit any ``---`` ... ``---``
        span further down the document (e.g. two markdown horizontal rules).
        Matching at one fixed position restricts it to the block that opens
        the file; leading blank lines are skipped so they are still tolerated.
        """
        # Index of the first non-whitespace character.
        start = len(content) - len(content.lstrip())
        return self.FRONT_MATTER_REGEX.match(content, start)

    def _parse_front_matter(self, content: str) -> dict:
        """Parse front matter metadata from the content and return it as a dict.

        Each ``key: value`` line of the leading front matter block becomes one
        entry (split on the first colon, both sides stripped). Lines without a
        colon are ignored.

        Returns:
            The parsed mapping; empty when ``collect_metadata`` is false or the
            content does not start with a front matter block.
        """
        if not self.collect_metadata:
            return {}

        match = self._match_front_matter(content)
        if match is None:
            return {}

        front_matter = {}
        for line in match.group(1).split("\n"):
            key, colon, value = line.partition(":")
            if colon:
                front_matter[key.strip()] = value.strip()
        return front_matter

    def _remove_front_matter(self, content: str) -> str:
        """Remove front matter metadata from the given content.

        Only the leading front matter block is removed; later ``---``
        delimited spans in the body are preserved. The content is returned
        unchanged when ``collect_metadata`` is false or there is no leading
        front matter block.
        """
        if not self.collect_metadata:
            return content

        match = self._match_front_matter(content)
        if match is None:
            return content
        # Cut out exactly the matched span, keeping anything before it.
        return content[: match.start()] + content[match.end() :]

    def _process_acreom_content(self, content: str) -> str:
        """Strip acreom-specific markup that adds no context to the document.

        Removes, in order: unchecked task lines (``- [ ] ...`` or ``[ ] ...``,
        together with the whitespace in front of them), every ``#`` character,
        and ``[[...]]`` document links.
        """
        content = re.sub(r"\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*", "", content)  # rm tasks
        content = re.sub(r"#", "", content)  # rm hashtags
        content = re.sub(r"\[\[.*?\]\]", "", content)  # rm doclinks
        return content

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per markdown file found under ``file_path``.

        Each file is read with the configured encoding, its front matter is
        parsed and stripped (subject to ``collect_metadata``), and the
        remaining text is cleaned of acreom-specific markup.

        The metadata holds ``source`` (file name) and ``path`` (path as
        globbed), followed by the front matter entries; a front matter key
        named ``source`` or ``path`` therefore overrides the default value.
        """
        for p in Path(self.file_path).glob("**/*.md"):
            raw = p.read_text(encoding=self.encoding)

            front_matter = self._parse_front_matter(raw)
            text = self._process_acreom_content(self._remove_front_matter(raw))

            metadata = {
                "source": str(p.name),
                "path": str(p),
                **front_matter,
            }

            yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Eagerly load every document produced by ``lazy_load`` into a list."""
        return list(self.lazy_load())