diff --git a/docs/modules/indexes/document_loaders/examples/acreom.ipynb b/docs/modules/indexes/document_loaders/examples/acreom.ipynb new file mode 100644 index 00000000..65a814c9 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/acreom.ipynb @@ -0,0 +1,75 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e310c8dc-acd0-48d2-801c-f37ce99acd2d", + "metadata": {}, + "source": [ + "# acreom" + ] + }, + { + "cell_type": "markdown", + "id": "04a2c95d-4114-431e-904a-32d79005c28b", + "metadata": {}, + "source": [ + "[acreom](https://acreom.com) is a dev-first knowledge base with tasks running on local markdown files.\n", + "\n", + "Below is an example of how to load a local acreom vault into LangChain. As the local vault in acreom is a folder of plain text .md files, the loader requires the path to the directory. \n", + "\n", + "Vault files may contain some metadata which is stored as a YAML header. These values will be added to the document’s metadata if `collect_metadata` is set to true. 
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0169bee5-aa7a-4ec7-b7e7-b3bb2e58f3bb", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import AcreomLoader" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c1b49ab3-616b-4149-bef5-7559d65d3d2b", + "metadata": {}, + "outputs": [], + "source": [ + "loader = AcreomLoader('', collect_metadata=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3127a018-9c1c-4886-8321-f5666d970a95", + "metadata": {}, + "outputs": [], + "source": [ + "docs = loader.load()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 17f764b1..71b03c2d 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -1,5 +1,6 @@ """All different types of document loaders.""" +from langchain.document_loaders.acreom import AcreomLoader from langchain.document_loaders.airbyte_json import AirbyteJSONLoader from langchain.document_loaders.airtable import AirtableLoader from langchain.document_loaders.apify_dataset import ApifyDatasetLoader @@ -136,6 +137,7 @@ PagedPDFSplitter = PyPDFLoader TelegramChatLoader = TelegramChatFileLoader __all__ = [ + "AcreomLoader", "AZLyricsLoader", "AirbyteJSONLoader", "AirtableLoader", diff --git a/langchain/document_loaders/acreom.py b/langchain/document_loaders/acreom.py new file mode 100644 index 00000000..1a31b94f --- /dev/null +++ b/langchain/document_loaders/acreom.py @@ -0,0 +1,73 @@ 
"""Loader that loads acreom vault from a directory."""
import re
from pathlib import Path
from typing import Iterator, List

from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader


class AcreomLoader(BaseLoader):
    """Load the markdown files of a local acreom vault as documents.

    Every ``*.md`` file found (recursively) under the given directory becomes
    one ``Document``. When ``collect_metadata`` is true, a leading
    ``---``-fenced header is parsed into the document metadata and stripped
    from the page content; when it is false the header is left in the content
    untouched and no header values are collected.
    """

    # Front matter is only meaningful at the very start of a file, so the
    # pattern is deliberately compiled WITHOUT re.MULTILINE: with it, ``^``
    # would also match a ``---`` horizontal rule in the middle of a note and
    # the text between two such rules would be parsed and deleted as if it
    # were a header.
    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.DOTALL)

    # Unchecked task lines, with or without a leading list dash.
    _TASK_REGEX = re.compile(r"\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*")
    # ``[[...]]`` links to other documents.
    _DOCLINK_REGEX = re.compile(r"\[\[.*?\]\]")

    def __init__(
        self, path: str, encoding: str = "UTF-8", collect_metadata: bool = True
    ):
        """Initialize with path.

        Args:
            path: Directory that is searched recursively for ``*.md`` files.
            encoding: Text encoding used to read each file.
            collect_metadata: Whether to parse the front matter header into
                the document metadata and remove it from the content.
        """
        self.file_path = path
        self.encoding = encoding
        self.collect_metadata = collect_metadata

    def _parse_front_matter(self, content: str) -> dict:
        """Parse front matter metadata from the content and return it as a dict.

        Only ``key: value`` lines are kept; keys and values are stripped of
        surrounding whitespace and values stay plain strings. Returns an empty
        dict when metadata collection is disabled or no header is present.
        """
        if not self.collect_metadata:
            return {}
        match = self.FRONT_MATTER_REGEX.match(content)
        if match is None:
            return {}
        front_matter: dict = {}
        for line in match.group(1).split("\n"):
            # Split on the first colon only, so values may contain colons.
            key, separator, value = line.partition(":")
            if separator:
                front_matter[key.strip()] = value.strip()
        return front_matter

    def _remove_front_matter(self, content: str) -> str:
        """Remove front matter metadata from the given content.

        The content is returned unchanged when metadata collection is
        disabled.
        """
        if not self.collect_metadata:
            return content
        # The pattern can only match at the start; count=1 makes that explicit.
        return self.FRONT_MATTER_REGEX.sub("", content, count=1)

    def _process_acreom_content(self, content: str) -> str:
        """Strip acreom-specific markup from the content.

        Removes elements that do not contribute to the context of the
        current document: unchecked task lines, every ``#`` character and
        ``[[...]]`` document links.
        """
        content = self._TASK_REGEX.sub("", content)  # rm tasks
        content = content.replace("#", "")  # rm hashtags
        content = self._DOCLINK_REGEX.sub("", content)  # rm doclinks
        return content

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per markdown file found under the path."""
        for path in Path(self.file_path).glob("**/*.md"):
            with open(path, encoding=self.encoding) as f:
                text = f.read()

            front_matter = self._parse_front_matter(text)
            text = self._remove_front_matter(text)

            text = self._process_acreom_content(text)

            # Front matter is merged last, so a header key named "source" or
            # "path" overrides the file-derived value.
            metadata = {
                "source": str(path.name),
                "path": str(path),
                **front_matter,
            }

            yield Document(page_content=text, metadata=metadata)

    def load(self) -> List[Document]:
        """Load all documents eagerly and return them as a list."""
        return list(self.lazy_load())