From 57b226532d16b532c43dba2f42e9aaeb720944a9 Mon Sep 17 00:00:00 2001 From: Benito Geordie <89472452+benitoThree@users.noreply.github.com> Date: Tue, 16 Apr 2024 18:36:55 -0500 Subject: [PATCH] community[minor]: Added integrations for ThirdAI's NeuralDB as a Retriever (#17334) **Description:** Adds ThirdAI NeuralDB retriever integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. We previously added a vector store integration but we think that it will be easier for our customers if they can also find us under langchain-community/retrievers. --------- Co-authored-by: kartikTAI <129414343+kartikTAI@users.noreply.github.com> Co-authored-by: Kartik Sarangmath --- .../retrievers/thirdai_neuraldb.ipynb | 148 ++++++++++ .../vectorstores/thirdai_neuraldb.ipynb | 14 +- .../retrievers/__init__.py | 1 + .../retrievers/thirdai_neuraldb.py | 260 ++++++++++++++++++ .../vectorstores/thirdai_neuraldb.py | 42 --- .../retrievers/test_thirdai_neuraldb.py | 58 ++++ .../vectorstores/test_thirdai_neuraldb.py | 8 - .../unit_tests/retrievers/test_imports.py | 1 + 8 files changed, 469 insertions(+), 63 deletions(-) create mode 100644 docs/docs/integrations/retrievers/thirdai_neuraldb.ipynb create mode 100644 libs/community/langchain_community/retrievers/thirdai_neuraldb.py create mode 100644 libs/community/tests/integration_tests/retrievers/test_thirdai_neuraldb.py diff --git a/docs/docs/integrations/retrievers/thirdai_neuraldb.ipynb b/docs/docs/integrations/retrievers/thirdai_neuraldb.ipynb new file mode 100644 index 0000000000..6b5b12e922 --- /dev/null +++ b/docs/docs/integrations/retrievers/thirdai_neuraldb.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **NeuralDB**\n", + "NeuralDB is a CPU-friendly and fine-tunable retrieval engine developed by ThirdAI.\n", + "\n", + "### **Initialization**\n", + "There are two initialization methods:\n", + "- From Scratch: Basic model\n", + "- From Checkpoint: 
Load a model that was previously saved\n", + "\n", + "For all of the following initialization methods, the `thirdai_key` parameter can be omitted if the `THIRDAI_KEY` environment variable is set.\n", + "\n", + "ThirdAI API keys can be obtained at https://www.thirdai.com/try-bolt/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.retrievers import NeuralDBRetriever\n", + "\n", + "# From scratch\n", + "retriever = NeuralDBRetriever.from_scratch(thirdai_key=\"your-thirdai-key\")\n", + "\n", + "# From checkpoint\n", + "retriever = NeuralDBRetriever.from_checkpoint(\n", + " # Path to a NeuralDB checkpoint. For example, if you call\n", + " # retriever.save(\"/path/to/checkpoint.ndb\") in one script, then you can\n", + " # call NeuralDBRetriever.from_checkpoint(\"/path/to/checkpoint.ndb\") in\n", + " # another script to load the saved model.\n", + " checkpoint=\"/path/to/checkpoint.ndb\",\n", + " thirdai_key=\"your-thirdai-key\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### **Inserting document sources**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "retriever.insert(\n", + " # If you have PDF, DOCX, or CSV files, you can directly pass the paths to the documents\n", + " sources=[\"/path/to/doc.pdf\", \"/path/to/doc.docx\", \"/path/to/doc.csv\"],\n", + " # When True this means that the underlying model in the NeuralDB will\n", + " # undergo unsupervised pretraining on the inserted files. Defaults to True.\n", + " train=True,\n", + " # Much faster insertion with a slight drop in performance. 
Defaults to True.\n", + " fast_mode=True,\n", + ")\n", + "\n", + "from thirdai import neural_db as ndb\n", + "\n", + "retriever.insert(\n", + " # If you have files in other formats, or prefer to configure how\n", + " # your files are parsed, then you can pass in NeuralDB document objects\n", + " # like this.\n", + " sources=[\n", + " ndb.PDF(\n", + " \"/path/to/doc.pdf\",\n", + " version=\"v2\",\n", + " chunk_size=100,\n", + " metadata={\"published\": 2022},\n", + " ),\n", + " ndb.Unstructured(\"/path/to/deck.pptx\"),\n", + " ]\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### **Retrieving documents**\n", + "To query the retriever, you can use the standard LangChain retriever method `get_relevant_documents`, which returns a list of LangChain Document objects. Each document object represents a chunk of text from the indexed files. For example, it may contain a paragraph from one of the indexed PDF files. In addition to the text, the document's metadata field contains information such as the document's ID, the source of this document (which file it came from), and the score of the document." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# This returns a list of LangChain Document objects\n", + "documents = retriever.get_relevant_documents(\"query\", top_k=10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### **Fine tuning**\n", + "NeuralDBRetriever can be fine-tuned to user behavior and domain-specific knowledge. It can be fine-tuned in two ways:\n", + "1. Association: the retriever associates a source phrase with a target phrase. When the retriever sees the source phrase, it will also consider results that are relevant to the target phrase.\n", + "2. Upvoting: the retriever upweights the score of a document for a specific query. This is useful when you want to fine-tune the retriever to user behavior. 
For example, if a user searches \"how is a car manufactured\" and likes the returned document with id 52, then we can upvote the document with id 52 for the query \"how is a car manufactured\"." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "retriever.associate(source=\"source phrase\", target=\"target phrase\")\n", + "retriever.associate_batch(\n", + " [\n", + " (\"source phrase 1\", \"target phrase 1\"),\n", + " (\"source phrase 2\", \"target phrase 2\"),\n", + " ]\n", + ")\n", + "\n", + "retriever.upvote(query=\"how is a car manufactured\", document_id=52)\n", + "retriever.upvote_batch(\n", + " [\n", + " (\"query 1\", 52),\n", + " (\"query 2\", 20),\n", + " ]\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "langchain", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/docs/integrations/vectorstores/thirdai_neuraldb.ipynb b/docs/docs/integrations/vectorstores/thirdai_neuraldb.ipynb index 137e501897..4eb8522760 100644 --- a/docs/docs/integrations/vectorstores/thirdai_neuraldb.ipynb +++ b/docs/docs/integrations/vectorstores/thirdai_neuraldb.ipynb @@ -10,9 +10,8 @@ "\n", "## Initialization\n", "\n", - "There are three initialization methods:\n", + "There are two initialization methods:\n", "- From Scratch: Basic model\n", - "- From Bazaar: Download a pretrained base model from our model bazaar for better performance\n", "- From Checkpoint: Load a model that was previously saved\n", "\n", "For all of the following initialization methods, the `thirdai_key` parameter can be omitted if the `THIRDAI_KEY` environment variable is set.\n", @@ -31,17 +30,6 @@ "# From scratch\n", "vectorstore = NeuralDBVectorStore.from_scratch(thirdai_key=\"your-thirdai-key\")\n", "\n", - "# From bazaar\n", - "vectorstore = NeuralDBVectorStore.from_bazaar(\n", - " # Name 
of base model to be downloaded from model bazaar.\n", - " # \"General QnA\" gives better performance on question-answering.\n", - " base=\"General QnA\",\n", - " # Path to a directory that caches models to prevent repeated downloading.\n", - " # Defaults to {CWD}/model_bazaar\n", - " bazaar_cache=\"/path/to/bazaar_cache\",\n", - " thirdai_key=\"your-thirdai-key\",\n", - ")\n", - "\n", "# From checkpoint\n", "vectorstore = NeuralDBVectorStore.from_checkpoint(\n", " # Path to a NeuralDB checkpoint. For example, if you call\n", diff --git a/libs/community/langchain_community/retrievers/__init__.py b/libs/community/langchain_community/retrievers/__init__.py index 7034d81552..32f5fc13b7 100644 --- a/libs/community/langchain_community/retrievers/__init__.py +++ b/libs/community/langchain_community/retrievers/__init__.py @@ -212,6 +212,7 @@ _module_lookup = { "YouRetriever": "langchain_community.retrievers.you", "ZepRetriever": "langchain_community.retrievers.zep", "ZillizRetriever": "langchain_community.retrievers.zilliz", + "NeuralDBRetriever": "langchain_community.retrievers.thirdai_neuraldb", } diff --git a/libs/community/langchain_community/retrievers/thirdai_neuraldb.py b/libs/community/langchain_community/retrievers/thirdai_neuraldb.py new file mode 100644 index 0000000000..9b436b3e5a --- /dev/null +++ b/libs/community/langchain_community/retrievers/thirdai_neuraldb.py @@ -0,0 +1,260 @@ +from __future__ import annotations + +import importlib +import os +from pathlib import Path +from typing import Any, Dict, List, Optional, Tuple, Union + +from langchain_core.callbacks import CallbackManagerForRetrieverRun +from langchain_core.documents import Document +from langchain_core.pydantic_v1 import Extra, SecretStr, root_validator +from langchain_core.retrievers import BaseRetriever +from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env + + +class NeuralDBRetriever(BaseRetriever): + """Document retriever that uses ThirdAI's NeuralDB.""" + + 
thirdai_key: SecretStr + """ThirdAI API Key""" + + db: Any = None #: :meta private: + """NeuralDB instance""" + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + underscore_attrs_are_private = True + + @staticmethod + def _verify_thirdai_library(thirdai_key: Optional[str] = None) -> None: + try: + from thirdai import licensing + + importlib.util.find_spec("thirdai.neural_db") + + licensing.activate(thirdai_key or os.getenv("THIRDAI_KEY")) + except ImportError: + raise ModuleNotFoundError( + "Could not import thirdai python package and neuraldb dependencies. " + "Please install it with `pip install thirdai[neural_db]`." + ) + + @classmethod + def from_scratch( + cls, + thirdai_key: Optional[str] = None, + **model_kwargs: dict, + ) -> NeuralDBRetriever: + """ + Create a NeuralDBRetriever from scratch. + + To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI + API key, or pass ``thirdai_key`` as a named parameter. + + Example: + .. code-block:: python + + from langchain_community.retrievers import NeuralDBRetriever + + retriever = NeuralDBRetriever.from_scratch( + thirdai_key="your-thirdai-key", + ) + + retriever.insert([ + "/path/to/doc.pdf", + "/path/to/doc.docx", + "/path/to/doc.csv", + ]) + + documents = retriever.get_relevant_documents("AI-driven music therapy") + """ + NeuralDBRetriever._verify_thirdai_library(thirdai_key) + from thirdai import neural_db as ndb + + return cls(thirdai_key=thirdai_key, db=ndb.NeuralDB(**model_kwargs)) + + @classmethod + def from_checkpoint( + cls, + checkpoint: Union[str, Path], + thirdai_key: Optional[str] = None, + ) -> NeuralDBRetriever: + """ + Create a NeuralDBRetriever with a base model from a saved checkpoint + + To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI + API key, or pass ``thirdai_key`` as a named parameter. + + Example: + .. 
code-block:: python + + from langchain_community.retrievers import NeuralDBRetriever + + retriever = NeuralDBRetriever.from_checkpoint( + checkpoint="/path/to/checkpoint.ndb", + thirdai_key="your-thirdai-key", + ) + + retriever.insert([ + "/path/to/doc.pdf", + "/path/to/doc.docx", + "/path/to/doc.csv", + ]) + + documents = retriever.get_relevant_documents("AI-driven music therapy") + """ + NeuralDBRetriever._verify_thirdai_library(thirdai_key) + from thirdai import neural_db as ndb + + return cls(thirdai_key=thirdai_key, db=ndb.NeuralDB.from_checkpoint(checkpoint)) + + @root_validator() + def validate_environments(cls, values: Dict) -> Dict: + """Validate ThirdAI environment variables.""" + values["thirdai_key"] = convert_to_secret_str( + get_from_dict_or_env( + values, + "thirdai_key", + "THIRDAI_KEY", + ) + ) + return values + + def insert( + self, + sources: List[Any], + train: bool = True, + fast_mode: bool = True, + **kwargs: dict, + ) -> None: + """Inserts files / document sources into the retriever. + + Args: + train: When True this means that the underlying model in the + NeuralDB will undergo unsupervised pretraining on the inserted files. + Defaults to True. + fast_mode: Much faster insertion with a slight drop in performance. + Defaults to True. + """ + sources = self._preprocess_sources(sources) + self.db.insert( + sources=sources, + train=train, + fast_approximation=fast_mode, + **kwargs, + ) + + def _preprocess_sources(self, sources: list) -> list: + """Checks if the provided sources are string paths. If they are, convert + to NeuralDB document objects. + + Args: + sources: list of either string paths to PDF, DOCX or CSV files, or + NeuralDB document objects. 
+ """ + from thirdai import neural_db as ndb + + if not sources: + return sources + preprocessed_sources = [] + for doc in sources: + if not isinstance(doc, str): + preprocessed_sources.append(doc) + else: + if doc.lower().endswith(".pdf"): + preprocessed_sources.append(ndb.PDF(doc)) + elif doc.lower().endswith(".docx"): + preprocessed_sources.append(ndb.DOCX(doc)) + elif doc.lower().endswith(".csv"): + preprocessed_sources.append(ndb.CSV(doc)) + else: + raise RuntimeError( + f"Could not automatically load {doc}. Only files " + "with .pdf, .docx, or .csv extensions can be loaded " + "automatically. For other formats, please use the " + "appropriate document object from the ThirdAI library." + ) + return preprocessed_sources + + def upvote(self, query: str, document_id: int) -> None: + """The retriever upweights the score of a document for a specific query. + This is useful for fine-tuning the retriever to user behavior. + + Args: + query: text to associate with `document_id` + document_id: id of the document to associate query with. + """ + self.db.text_to_result(query, document_id) + + def upvote_batch(self, query_id_pairs: List[Tuple[str, int]]) -> None: + """Given a batch of (query, document id) pairs, the retriever upweights + the scores of the document for the corresponding queries. + This is useful for fine-tuning the retriever to user behavior. + + Args: + query_id_pairs: list of (query, document id) pairs. For each pair in + this list, the model will upweight the document id for the query. + """ + self.db.text_to_result_batch(query_id_pairs) + + def associate(self, source: str, target: str) -> None: + """The retriever associates a source phrase with a target phrase. + When the retriever sees the source phrase, it will also consider results + that are relevant to the target phrase. + + Args: + source: text to associate to `target`. + target: text to associate `source` to. 
+ """ + self.db.associate(source, target) + + def associate_batch(self, text_pairs: List[Tuple[str, str]]) -> None: + """Given a batch of (source, target) pairs, the retriever associates + each source phrase with the corresponding target phrase. + + Args: + text_pairs: list of (source, target) text pairs. For each pair in + this list, the source will be associated with the target. + """ + self.db.associate_batch(text_pairs) + + def _get_relevant_documents( + self, query: str, run_manager: CallbackManagerForRetrieverRun, **kwargs: Any + ) -> List[Document]: + """Retrieve {top_k} contexts with your retriever for a given query + + Args: + query: Query to submit to the model + top_k: The max number of context results to retrieve. Defaults to 10. + """ + try: + if "top_k" not in kwargs: + kwargs["top_k"] = 10 + references = self.db.search(query=query, **kwargs) + return [ + Document( + page_content=ref.text, + metadata={ + "id": ref.id, + "upvote_ids": ref.upvote_ids, + "source": ref.source, + "metadata": ref.metadata, + "score": ref.score, + "context": ref.context(1), + }, + ) + for ref in references + ] + except Exception as e: + raise ValueError(f"Error while retrieving documents: {e}") from e + + def save(self, path: str) -> None: + """Saves a NeuralDB instance to disk. Can be loaded into memory by + calling NeuralDB.from_checkpoint(path) + + Args: + path: path on disk to save the NeuralDB instance to. 
+ """ + self.db.save(path) diff --git a/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py b/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py index 25ab3f70ab..beece9ce3a 100644 --- a/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py +++ b/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py @@ -86,48 +86,6 @@ class NeuralDBVectorStore(VectorStore): return cls(db=ndb.NeuralDB(**model_kwargs)) # type: ignore[call-arg] - @classmethod - def from_bazaar( # type: ignore[no-untyped-def] - cls, - base: str, - bazaar_cache: Optional[str] = None, - thirdai_key: Optional[str] = None, - ): - """ - Create a NeuralDBVectorStore with a base model from the ThirdAI - model bazaar. - - To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI - API key, or pass ``thirdai_key`` as a named parameter. - - Example: - .. code-block:: python - - from langchain_community.vectorstores import NeuralDBVectorStore - - vectorstore = NeuralDBVectorStore.from_bazaar( - base="General QnA", - thirdai_key="your-thirdai-key", - ) - - vectorstore.insert([ - "/path/to/doc.pdf", - "/path/to/doc.docx", - "/path/to/doc.csv", - ]) - - documents = vectorstore.similarity_search("AI-driven music therapy") - """ - NeuralDBVectorStore._verify_thirdai_library(thirdai_key) - from thirdai import neural_db as ndb - - cache = bazaar_cache or str(Path(os.getcwd()) / "model_bazaar") - if not os.path.exists(cache): - os.mkdir(cache) - model_bazaar = ndb.Bazaar(cache) - model_bazaar.fetch() - return cls(db=model_bazaar.get_model(base)) # type: ignore[call-arg] - @classmethod def from_checkpoint( # type: ignore[no-untyped-def] cls, diff --git a/libs/community/tests/integration_tests/retrievers/test_thirdai_neuraldb.py b/libs/community/tests/integration_tests/retrievers/test_thirdai_neuraldb.py new file mode 100644 index 0000000000..b8d384f9af --- /dev/null +++ b/libs/community/tests/integration_tests/retrievers/test_thirdai_neuraldb.py @@ 
-0,0 +1,58 @@ +import os +import shutil +from typing import Generator + +import pytest + +from langchain_community.retrievers import NeuralDBRetriever + + +@pytest.fixture(scope="session") +def test_csv() -> Generator[str, None, None]: + csv = "thirdai-test.csv" + with open(csv, "w") as o: + o.write("column_1,column_2\n") + o.write("column one,column two\n") + yield csv + os.remove(csv) + + +def assert_result_correctness(documents: list) -> None: + assert len(documents) == 1 + assert documents[0].page_content == "column_1: column one\n\ncolumn_2: column two" + + +@pytest.mark.requires("thirdai[neural_db]") +def test_neuraldb_retriever_from_scratch(test_csv: str) -> None: + retriever = NeuralDBRetriever.from_scratch() + retriever.insert([test_csv]) + documents = retriever.get_relevant_documents("column") + assert_result_correctness(documents) + + +@pytest.mark.requires("thirdai[neural_db]") +def test_neuraldb_retriever_from_checkpoint(test_csv: str) -> None: + checkpoint = "thirdai-test-save.ndb" + if os.path.exists(checkpoint): + shutil.rmtree(checkpoint) + try: + retriever = NeuralDBRetriever.from_scratch() + retriever.insert([test_csv]) + retriever.save(checkpoint) + loaded_retriever = NeuralDBRetriever.from_checkpoint(checkpoint) + documents = loaded_retriever.get_relevant_documents("column") + assert_result_correctness(documents) + finally: + if os.path.exists(checkpoint): + shutil.rmtree(checkpoint) + + +@pytest.mark.requires("thirdai[neural_db]") +def test_neuraldb_retriever_other_methods(test_csv: str) -> None: + retriever = NeuralDBRetriever.from_scratch() + retriever.insert([test_csv]) + # Make sure they don't throw an error. 
+ retriever.associate("A", "B") + retriever.associate_batch([("A", "B"), ("C", "D")]) + retriever.upvote("A", 0) + retriever.upvote_batch([("A", 0), ("B", 0)]) diff --git a/libs/community/tests/integration_tests/vectorstores/test_thirdai_neuraldb.py b/libs/community/tests/integration_tests/vectorstores/test_thirdai_neuraldb.py index 370e8ff54f..f75a196e64 100644 --- a/libs/community/tests/integration_tests/vectorstores/test_thirdai_neuraldb.py +++ b/libs/community/tests/integration_tests/vectorstores/test_thirdai_neuraldb.py @@ -46,14 +46,6 @@ def test_neuraldb_retriever_from_checkpoint(test_csv): # type: ignore[no-untype shutil.rmtree(checkpoint) -@pytest.mark.requires("thirdai[neural_db]") -def test_neuraldb_retriever_from_bazaar(test_csv): # type: ignore[no-untyped-def] - retriever = NeuralDBVectorStore.from_bazaar("General QnA") - retriever.insert([test_csv]) - documents = retriever.similarity_search("column") - assert_result_correctness(documents) - - @pytest.mark.requires("thirdai[neural_db]") def test_neuraldb_retriever_other_methods(test_csv): # type: ignore[no-untyped-def] retriever = NeuralDBVectorStore.from_scratch() diff --git a/libs/community/tests/unit_tests/retrievers/test_imports.py b/libs/community/tests/unit_tests/retrievers/test_imports.py index 9fa6b4ba04..b2897f30f5 100644 --- a/libs/community/tests/unit_tests/retrievers/test_imports.py +++ b/libs/community/tests/unit_tests/retrievers/test_imports.py @@ -40,6 +40,7 @@ EXPECTED_ALL = [ "ZepRetriever", "ZillizRetriever", "DocArrayRetriever", + "NeuralDBRetriever", ]