From a9c24503309e2e3eb800f335e0fbc7c22531bda0 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Wed, 3 May 2023 23:14:39 -0700 Subject: [PATCH] Harrison/toml loader (#4090) Co-authored-by: Mika Ayenson --- .../examples/directory_loader.ipynb | 4 +- .../examples/example_data/fake_rule.toml | 22 +++++ .../document_loaders/examples/toml.ipynb | 94 +++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/toml.py | 47 ++++++++++ 5 files changed, 167 insertions(+), 2 deletions(-) create mode 100644 docs/modules/indexes/document_loaders/examples/example_data/fake_rule.toml create mode 100644 docs/modules/indexes/document_loaders/examples/toml.ipynb create mode 100644 langchain/document_loaders/toml.py diff --git a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb index b2bee4d9..6d6afacb 100644 --- a/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb +++ b/docs/modules/indexes/document_loaders/examples/directory_loader.ipynb @@ -233,7 +233,7 @@ { "cell_type": "code", "execution_count": null, - "id": "7f6e0eae", + "id": "6a91a0bc", "metadata": {}, "outputs": [], "source": [] @@ -255,7 +255,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/modules/indexes/document_loaders/examples/example_data/fake_rule.toml b/docs/modules/indexes/document_loaders/examples/example_data/fake_rule.toml new file mode 100644 index 00000000..df564383 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/example_data/fake_rule.toml @@ -0,0 +1,22 @@ +[internal] +creation_date = "2023-05-01" +updated_date = "2022-05-01" +release = ["release_type"] +min_endpoint_version = "some_semantic_version" +os_list = ["operating_system_list"] + +[rule] +uuid = "some_uuid" +name = "Fake Rule Name" +description = "Fake description of rule" 
+query = ''' +process where process.name : "somequery" +''' + +[[rule.threat]] +framework = "MITRE ATT&CK" + + [rule.threat.tactic] + name = "Execution" + id = "TA0002" + reference = "https://attack.mitre.org/tactics/TA0002/" diff --git a/docs/modules/indexes/document_loaders/examples/toml.ipynb b/docs/modules/indexes/document_loaders/examples/toml.ipynb new file mode 100644 index 00000000..e5931042 --- /dev/null +++ b/docs/modules/indexes/document_loaders/examples/toml.ipynb @@ -0,0 +1,94 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4284970b", + "metadata": {}, + "source": [ + "# TOML Loader\n", + "\n", + "If you need to load Toml files, use the `TomlLoader`." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "202fc42d", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import TomlLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "7ecae98c", + "metadata": {}, + "outputs": [], + "source": [ + "loader = TomlLoader('example_data/fake_rule.toml')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "eb08c26e", + "metadata": {}, + "outputs": [], + "source": [ + "rule = loader.load()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "405d36bc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='{\"internal\": {\"creation_date\": \"2023-05-01\", \"updated_date\": \"2022-05-01\", \"release\": [\"release_type\"], \"min_endpoint_version\": \"some_semantic_version\", \"os_list\": [\"operating_system_list\"]}, \"rule\": {\"uuid\": \"some_uuid\", \"name\": \"Fake Rule Name\", \"description\": \"Fake description of rule\", \"query\": \"process where process.name : \\\\\"somequery\\\\\"\\\\n\", \"threat\": [{\"framework\": \"MITRE ATT&CK\", \"tactic\": {\"name\": \"Execution\", \"id\": \"TA0002\", \"reference\": \"https://attack.mitre.org/tactics/TA0002/\"}}]}}', metadata={'source': 
'example_data/fake_rule.toml'})]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "rule" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a896454d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 2d8c867d..c4f0c8f3 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -78,6 +78,7 @@ from langchain.document_loaders.srt import SRTLoader from langchain.document_loaders.stripe import StripeLoader from langchain.document_loaders.telegram import TelegramChatLoader from langchain.document_loaders.text import TextLoader +from langchain.document_loaders.toml import TomlLoader from langchain.document_loaders.twitter import TwitterTweetLoader from langchain.document_loaders.unstructured import ( UnstructuredAPIFileIOLoader, @@ -169,6 +170,7 @@ __all__ = [ "SlackDirectoryLoader", "TelegramChatLoader", "TextLoader", + "TomlLoader", "TwitterTweetLoader", "UnstructuredEPubLoader", "UnstructuredEmailLoader", diff --git a/langchain/document_loaders/toml.py b/langchain/document_loaders/toml.py new file mode 100644 index 00000000..1a36eb6e --- /dev/null +++ b/langchain/document_loaders/toml.py @@ -0,0 +1,47 @@ +import json +from pathlib import Path +from typing import Iterable, List, Union + +from langchain.docstore.document import Document +from langchain.document_loaders.base import BaseLoader + + +class 
class TomlLoader(BaseLoader):
    """Load TOML files as documents holding the JSON form of their data.

    The loader accepts either a single ``.toml`` file or a directory. A
    directory is searched recursively for ``*.toml`` files. Every file that
    parses successfully produces one ``Document`` whose ``page_content`` is
    the parsed TOML serialized with ``json.dumps`` and whose
    ``metadata["source"]`` is the path of the file.
    """

    def __init__(self, source: Union[str, Path]):
        """Initialize the TomlLoader with a source file or directory.

        Args:
            source: Path to a ``.toml`` file, or to a directory that is
                searched recursively for ``*.toml`` files. The path is only
                validated when documents are actually loaded.
        """
        self.source = Path(source)

    def load(self) -> List[Document]:
        """Load and return all documents.

        Returns:
            One ``Document`` per TOML file that could be parsed.

        Raises:
            ValueError: If the source is neither a ``.toml`` file nor a
                directory.
            ImportError: If no TOML parser is available.
        """
        return list(self.lazy_load())

    @staticmethod
    def _json_default(value: object) -> str:
        """Serialize the TOML value types that ``json`` cannot encode itself.

        TOML has native date, time and date-time values, which the parser
        returns as ``datetime`` objects; they are rendered in ISO 8601 form.
        Anything else is rejected the same way ``json.dumps`` would reject it.
        """
        from datetime import date, time

        # datetime.datetime is a subclass of date, so it is covered here too.
        if isinstance(value, (date, time)):
            return value.isoformat()
        raise TypeError(
            f"Object of type {type(value).__name__} is not JSON serializable"
        )

    def lazy_load(self) -> Iterable[Document]:
        """Lazily load the TOML documents from the source file or directory.

        Files that are not valid TOML are reported on stdout and skipped so
        that one broken file does not abort loading a whole directory.

        Yields:
            One ``Document`` per successfully parsed file. Files found in a
            directory are yielded in sorted path order.

        Raises:
            ValueError: If the source is neither a ``.toml`` file nor a
                directory.
            ImportError: If neither ``tomli`` nor the standard-library
                ``tomllib`` can be imported.
        """
        try:
            import tomli
        except ImportError:
            try:
                # Python 3.11+ ships the same parser as ``tomllib``; it exposes
                # the same ``loads`` / ``TOMLDecodeError`` names used below.
                import tomllib as tomli
            except ImportError:
                raise ImportError(
                    "Could not import tomli python package. "
                    "Please install it with `pip install tomli`."
                ) from None

        if self.source.is_file() and self.source.suffix == ".toml":
            files = [self.source]
        elif self.source.is_dir():
            # Sort for a deterministic document order and skip directories
            # that merely happen to be named ``*.toml``.
            files = sorted(
                path for path in self.source.glob("**/*.toml") if path.is_file()
            )
        else:
            raise ValueError("Invalid source path or file type")

        for file_path in files:
            # Read the whole file up front so no handle stays open while the
            # consumer is holding the generator at a ``yield``.
            content = file_path.read_text(encoding="utf-8")
            try:
                data = tomli.loads(content)
            except tomli.TOMLDecodeError as e:
                # Deliberate best-effort: report the bad file and keep going.
                print(f"Error parsing TOML file {file_path}: {e}")
                continue
            yield Document(
                page_content=json.dumps(data, default=self._json_default),
                metadata={"source": str(file_path)},
            )