From 60dc19da3046e393751288949de9bef3729dda65 Mon Sep 17 00:00:00 2001
From: Rajendra Kadam
Date: Thu, 19 Sep 2024 19:29:04 +0530
Subject: [PATCH] [community] Added PebbloTextLoader for loading text data in PebbloSafeLoader (#26582)

- **Description:** Added PebbloTextLoader for loading text in
PebbloSafeLoader.
- Since PebbloSafeLoader wraps document loaders, this new loader enables
direct loading of text into Documents using PebbloSafeLoader.
- **Issue:** NA
- **Dependencies:** NA
- [x] **Tests**: Added/Updated tests
---
Review note: PebbloTextLoader.lazy_load now copies the shared `metadata`
dict for every Document. Previously `metadata.update(...)` mutated
`self.metadata` in place, so per-text entries from `metadatas[i]` leaked
into all later Documents and into the loader's own state.

 .../document_loaders/__init__.py              |  3 +
 .../document_loaders/pebblo.py                | 67 ++++++++++++++++++-
 .../document_loaders/test_imports.py          |  1 +
 .../document_loaders/test_pebblo.py           | 44 +++++++++++++
 4 files changed, 114 insertions(+), 1 deletion(-)

diff --git a/libs/community/langchain_community/document_loaders/__init__.py b/libs/community/langchain_community/document_loaders/__init__.py
index eb059d6fbe..2576093d3d 100644
--- a/libs/community/langchain_community/document_loaders/__init__.py
+++ b/libs/community/langchain_community/document_loaders/__init__.py
@@ -359,6 +359,7 @@ if TYPE_CHECKING:
     )
     from langchain_community.document_loaders.pebblo import (
         PebbloSafeLoader,
+        PebbloTextLoader,
     )
     from langchain_community.document_loaders.polars_dataframe import (
         PolarsDataFrameLoader,
@@ -650,6 +651,7 @@ _module_lookup = {
     "PDFPlumberLoader": "langchain_community.document_loaders.pdf",
     "PagedPDFSplitter": "langchain_community.document_loaders.pdf",
     "PebbloSafeLoader": "langchain_community.document_loaders.pebblo",
+    "PebbloTextLoader": "langchain_community.document_loaders.pebblo",
     "PlaywrightURLLoader": "langchain_community.document_loaders.url_playwright",
     "PolarsDataFrameLoader": "langchain_community.document_loaders.polars_dataframe",
     "PsychicLoader": "langchain_community.document_loaders.psychic",
@@ -855,6 +857,7 @@ __all__ = [
     "PDFPlumberLoader",
     "PagedPDFSplitter",
     "PebbloSafeLoader",
+    "PebbloTextLoader",
     "PlaywrightURLLoader",
     "PolarsDataFrameLoader",
     "PsychicLoader",
diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index bcf1cde050..8d3f54e342 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -4,7 +4,7 @@ import logging
 import os
 import uuid
 from importlib.metadata import version
-from typing import Dict, Iterator, List, Optional
+from typing import Any, Dict, Iterable, Iterator, List, Optional
 
 from langchain_core.documents import Document
 
@@ -271,3 +271,68 @@ class PebbloSafeLoader(BaseLoader):
             doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
                 "pb_checksum", None
             )
+
+
+class PebbloTextLoader(BaseLoader):
+    """
+    Loader for text data.
+
+    Since PebbloSafeLoader is a wrapper around document loaders, this loader is
+    used to load text data directly into Documents.
+    """
+
+    def __init__(
+        self,
+        texts: Iterable[str],
+        *,
+        source: Optional[str] = None,
+        ids: Optional[List[str]] = None,
+        metadata: Optional[Dict[str, Any]] = None,
+        metadatas: Optional[List[Dict[str, Any]]] = None,
+    ) -> None:
+        """
+        Args:
+            texts: Iterable of text data.
+            source: Source of the text data.
+                Optional. Defaults to None.
+            ids: List of unique identifiers for each text.
+                Optional. Defaults to None.
+            metadata: Metadata for all texts.
+                Optional. Defaults to None.
+            metadatas: List of metadata for each text.
+                Optional. Defaults to None.
+        """
+        self.texts = texts
+        self.source = source
+        self.ids = ids
+        self.metadata = metadata
+        self.metadatas = metadatas
+
+    def lazy_load(self) -> Iterator[Document]:
+        """
+        Lazy load text data into Documents.
+
+        Returns:
+            Iterator of Documents
+        """
+        for i, text in enumerate(self.texts):
+            _id = None
+            # Copy so per-text updates never leak into self.metadata or later docs.
+            metadata = dict(self.metadata or {})
+            if self.metadatas and i < len(self.metadatas) and self.metadatas[i]:
+                metadata.update(self.metadatas[i])
+            if self.ids and i < len(self.ids):
+                _id = self.ids[i]
+            yield Document(id=_id, page_content=text, metadata=metadata)
+
+    def load(self) -> List[Document]:
+        """
+        Load text data into Documents.
+
+        Returns:
+            List of Documents
+        """
+        documents = []
+        for doc in self.lazy_load():
+            documents.append(doc)
+        return documents
diff --git a/libs/community/tests/unit_tests/document_loaders/test_imports.py b/libs/community/tests/unit_tests/document_loaders/test_imports.py
index fbf624f537..b49a1b7cc4 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_imports.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_imports.py
@@ -55,6 +55,7 @@ EXPECTED_ALL = [
     "DedocFileLoader",
     "DedocPDFLoader",
     "PebbloSafeLoader",
+    "PebbloTextLoader",
     "DiffbotLoader",
     "DirectoryLoader",
     "DiscordChatLoader",
diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
index 89617b9cd5..9d95bc98fd 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
@@ -25,6 +25,11 @@ def test_pebblo_import() -> None:
     from langchain_community.document_loaders import PebbloSafeLoader  # noqa: F401
 
 
+def test_pebblo_text_loader_import() -> None:
+    """Test that the Pebblo text loader can be imported."""
+    from langchain_community.document_loaders import PebbloTextLoader  # noqa: F401
+
+
 def test_empty_filebased_loader(mocker: MockerFixture) -> None:
     """Test basic file based csv loader."""
     # Setup
@@ -146,3 +151,42 @@ def test_pebblo_safe_loader_api_key() -> None:
     # Assert
     assert loader.pb_client.api_key == api_key
     assert loader.pb_client.classifier_location == "local"
+
+
+def test_pebblo_text_loader(mocker: MockerFixture) -> None:
+    """
+    Test loading in-memory text with PebbloTextLoader and PebbloSafeLoader.
+    """
+    # Setup
+    from langchain_community.document_loaders import PebbloSafeLoader, PebbloTextLoader
+
+    mocker.patch.multiple(
+        "requests",
+        get=MockResponse(json_data={"data": ""}, status_code=200),
+        post=MockResponse(json_data={"data": ""}, status_code=200),
+    )
+
+    text = "This is a test text."
+    source = "fake_source"
+    expected_docs = [
+        Document(
+            metadata={
+                "full_path": source,
+                "pb_checksum": None,
+            },
+            page_content=text,
+        ),
+    ]
+
+    # Exercise
+    texts = [text]
+    loader = PebbloSafeLoader(
+        PebbloTextLoader(texts, source=source),
+        "dummy_app_name",
+        "dummy_owner",
+        "dummy_description",
+    )
+    result = loader.load()
+
+    # Assert
+    assert result == expected_docs