import re
from pathlib import Path
from typing import Iterator, Union

from langchain_core.documents import Document

from langchain_community.document_loaders.base import BaseLoader


class AcreomLoader(BaseLoader):
    """Load `acreom` vault from a directory."""

    FRONT_MATTER_REGEX = re.compile(r"^---\n(.*?)\n---\n", re.MULTILINE | re.DOTALL)
    """Regex to match front matter metadata in markdown files."""
    def __init__(
        self,
        path: Union[str, Path],
        encoding: str = "UTF-8",
        collect_metadata: bool = True,
    ):
        """Initialize the loader."""
        self.file_path = path
        """Path to the directory containing the markdown files."""
        self.encoding = encoding
        """Encoding to use when reading the files."""
        self.collect_metadata = collect_metadata
        """Whether to collect metadata from the front matter."""

    def _parse_front_matter(self, content: str) -> dict:
        """Parse front matter metadata from the content and return it as a dict."""
        if not self.collect_metadata:
            return {}
        match = self.FRONT_MATTER_REGEX.search(content)
        front_matter = {}
        if match:
            lines = match.group(1).split("\n")
            for line in lines:
                if ":" in line:
                    key, value = line.split(":", 1)
                    front_matter[key.strip()] = value.strip()
                else:
                    # Skip lines without a colon
                    continue
        return front_matter

    def _remove_front_matter(self, content: str) -> str:
        """Remove front matter metadata from the given content."""
        if not self.collect_metadata:
            return content
        return self.FRONT_MATTER_REGEX.sub("", content)

    def _process_acreom_content(self, content: str) -> str:
        # Remove acreom-specific elements from the content that do not
        # contribute to the context of the current document.
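        # Illustrative examples of what gets stripped below (hypothetical content):
        #   "- [ ] buy milk"        -> removed (open task)
        #   "#project" / "# Title"  -> "#" characters removed
        #   "[[Some other note]]"   -> removed (doc link)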
        content = re.sub(r"\s*-\s\[\s\]\s.*|\s*\[\s\]\s.*", "", content)  # rm tasks
        content = re.sub(r"#", "", content)  # rm hashtags
        content = re.sub(r"\[\[.*?\]\]", "", content)  # rm doclinks
        return content

    def lazy_load(self) -> Iterator[Document]:
        """Lazily yield one Document per markdown file in the vault."""
        ps = list(Path(self.file_path).glob("**/*.md"))

        for p in ps:
            with open(p, encoding=self.encoding) as f:
                text = f.read()

            front_matter = self._parse_front_matter(text)
            text = self._remove_front_matter(text)
            text = self._process_acreom_content(text)

            metadata = {
                "source": str(p.name),
                "path": str(p),
                **front_matter,
            }

            yield Document(page_content=text, metadata=metadata)
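

# Minimal usage sketch (illustrative; "path/to/vault" is a placeholder, not a
# real directory). lazy_load() iterates file by file, so large vaults are not
# read into memory all at once.
if __name__ == "__main__":
    loader = AcreomLoader("path/to/vault", collect_metadata=True)
    for doc in loader.lazy_load():
        print(doc.metadata["path"], len(doc.page_content))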