# langchain/libs/community/langchain_community/vectorstores/thirdai_neuraldb.py

import importlib
import os
import tempfile
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import Extra, root_validator
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
from langchain_core.vectorstores import VectorStore


class NeuralDBVectorStore(VectorStore):
    """Vectorstore that uses ThirdAI's NeuralDB.

    To use, you should have the ``thirdai[neural_db]`` python package installed.

    Example:
        .. code-block:: python

            from langchain_community.vectorstores import NeuralDBVectorStore
            from thirdai import neural_db as ndb

            db = ndb.NeuralDB()
            vectorstore = NeuralDBVectorStore(db=db)
    """

    def __init__(self, db: Any) -> None:
        """Wrap an existing NeuralDB object.

        Args:
            db: object stored on ``self.db``; the other methods of this class
                forward their work to it.
        """
        self.db = db

    # Class-level default only; ``__init__`` assigns the per-instance value.
    db: Any = None  #: :meta private:
    """NeuralDB instance"""

    class Config:
        """Configuration for this pydantic object."""

        # NOTE(review): these are pydantic (v1) model settings; presumably they
        # only take effect if the enclosing class is a pydantic model — confirm.
        extra = Extra.forbid
        underscore_attrs_are_private = True

    @staticmethod
    def _verify_thirdai_library(thirdai_key: Optional[str] = None):  # type: ignore[no-untyped-def]
        try:
            from thirdai import licensing

            importlib.util.find_spec("thirdai.neural_db")

            licensing.activate(thirdai_key or os.getenv("THIRDAI_KEY"))
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import thirdai python package and neuraldb dependencies. "
                "Please install it with `pip install thirdai[neural_db]`."
            )

    @classmethod
    def from_scratch(  # type: ignore[no-untyped-def, no-untyped-def]
        cls,
        thirdai_key: Optional[str] = None,
        **model_kwargs,
    ):
        """
        Build a NeuralDBVectorStore around a brand-new NeuralDB.

        The ThirdAI license is activated first, using ``thirdai_key`` or, when
        it is not given, the ``THIRDAI_KEY`` environment variable. Every extra
        keyword argument is forwarded to the ``ndb.NeuralDB`` constructor.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_scratch(
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        # Imported only after the library check so a missing install surfaces
        # through the check's error message.
        from thirdai import neural_db as ndb

        fresh_db = ndb.NeuralDB(**model_kwargs)
        return cls(db=fresh_db)  # type: ignore[call-arg]

    @classmethod
    def from_checkpoint(  # type: ignore[no-untyped-def]
        cls,
        checkpoint: Union[str, Path],
        thirdai_key: Optional[str] = None,
    ):
        """
        Build a NeuralDBVectorStore from a previously saved NeuralDB checkpoint.

        The ThirdAI license is activated first, using ``thirdai_key`` or, when
        it is not given, the ``THIRDAI_KEY`` environment variable. ``checkpoint``
        is handed unchanged to ``ndb.NeuralDB.from_checkpoint``.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import NeuralDBVectorStore

                vectorstore = NeuralDBVectorStore.from_checkpoint(
                    checkpoint="/path/to/checkpoint.ndb",
                    thirdai_key="your-thirdai-key",
                )

                vectorstore.insert([
                    "/path/to/doc.pdf",
                    "/path/to/doc.docx",
                    "/path/to/doc.csv",
                ])

                documents = vectorstore.similarity_search("AI-driven music therapy")
        """
        NeuralDBVectorStore._verify_thirdai_library(thirdai_key)
        # Imported only after the library check so a missing install surfaces
        # through the check's error message.
        from thirdai import neural_db as ndb

        restored_db = ndb.NeuralDB.from_checkpoint(checkpoint)
        return cls(db=restored_db)  # type: ignore[call-arg]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> "NeuralDBVectorStore":
        """Create a new store via ``from_scratch`` and add ``texts`` to it.

        Args:
            texts: strings to add to the new vectorstore.
            embedding: part of the ``VectorStore`` signature; this method does
                not read it.
            metadatas: optional metadata dicts passed along with ``texts``.
            kwargs: ``thirdai_key`` (if present) goes to ``from_scratch``; all
                remaining entries go to ``add_texts``.
        """
        # Split off the license key so it is not forwarded to add_texts.
        scratch_kwargs = {}
        if "thirdai_key" in kwargs:
            scratch_kwargs["thirdai_key"] = kwargs.pop("thirdai_key")

        store = cls.from_scratch(**scratch_kwargs)
        store.add_texts(texts, metadatas, **kwargs)
        return store

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Write the texts to a temporary CSV file and insert it into NeuralDB.

        Args:
            texts: Iterable of strings to add to the vectorstore. It is
                consumed exactly once, so one-shot iterators are accepted.
            metadatas: Optional list of metadatas associated with the texts.
                Each dict becomes extra columns next to the ``texts`` column.
            kwargs: vectorstore specific parameters. ``train`` and
                ``fast_mode`` (both default to True) have the same meaning as
                in ``insert``; everything else is forwarded to the NeuralDB
                ``insert`` call.

        Returns:
            List of ids from adding the texts into the vectorstore, one per
            text. An empty input returns an empty list without touching the
            database.
        """
        # Materialize once: the rows are needed to build the frame and to be
        # counted afterwards, and ``texts`` may not support ``len``.
        text_list = list(texts)
        if not text_list:
            return []

        import pandas as pd
        from thirdai import neural_db as ndb

        df = pd.DataFrame({"texts": text_list})
        if metadatas:
            df = pd.concat([df, pd.DataFrame.from_records(metadatas)], axis=1)

        # The ``with`` block closes (and therefore flushes) the file before
        # NeuralDB reads it back by name. ``delete_on_close`` is not passed:
        # it only exists on Python 3.12+ and is ignored when delete=False.
        # NOTE(review): the file is deliberately left on disk, as before;
        # presumably NeuralDB may need the path again later — confirm before
        # adding cleanup.
        with tempfile.NamedTemporaryFile("w", delete=False) as temp:
            df.to_csv(temp)

        # Call the database directly so the ids it returns are available here,
        # applying the same keyword mapping that ``insert`` uses.
        train = kwargs.pop("train", True)
        fast_mode = kwargs.pop("fast_mode", True)
        source_ids = self.db.insert(
            sources=[ndb.CSV(temp.name)],
            train=train,
            fast_approximation=fast_mode,
            **kwargs,
        )
        source_id = source_ids[0]
        # Element 1 of the lookup result is used as the id of the first row.
        offset = self.db._savable_state.documents.get_source_by_id(source_id)[1]
        return [str(offset + i) for i in range(len(text_list))]

    @root_validator()
    def validate_environments(cls, values: Dict) -> Dict:
        """Fill ``values["thirdai_key"]`` from ``values`` or ``THIRDAI_KEY``.

        The key is looked up with ``get_from_dict_or_env`` and stored back
        wrapped by ``convert_to_secret_str``; the same dict is returned.
        """
        raw_key = get_from_dict_or_env(values, "thirdai_key", "THIRDAI_KEY")
        values["thirdai_key"] = convert_to_secret_str(raw_key)
        return values

    def insert(  # type: ignore[no-untyped-def, no-untyped-def]
        self,
        sources: List[Any],
        train: bool = True,
        fast_mode: bool = True,
        **kwargs,
    ):
        """Inserts files / document sources into the vectorstore.

        Args:
            sources: list of string paths and/or NeuralDB document objects.
                String paths are converted by ``_preprocess_sources`` first.
            train: When True this means that the underlying model in the
                NeuralDB will undergo unsupervised pretraining on the inserted
                files. Defaults to True.
            fast_mode: Much faster insertion with a slight drop in performance.
                Passed to NeuralDB as ``fast_approximation``. Defaults to True.
            kwargs: forwarded unchanged to the NeuralDB ``insert`` call.

        Returns:
            Whatever the underlying ``db.insert`` call returns. ``add_texts``
            in this class treats that value as a sequence of source ids.
        """
        sources = self._preprocess_sources(sources)
        # Hand the database's result back to the caller instead of dropping it,
        # so the ids of the inserted sources stay reachable.
        return self.db.insert(
            sources=sources,
            train=train,
            fast_approximation=fast_mode,
            **kwargs,
        )

    def _preprocess_sources(self, sources):  # type: ignore[no-untyped-def]
        """Turn string paths into NeuralDB document objects.

        Entries that are not strings are kept as they are. A string is matched
        case-insensitively on its ending: ``.pdf``, ``.docx`` and ``.csv`` map
        to ``ndb.PDF``, ``ndb.DOCX`` and ``ndb.CSV`` respectively.

        Args:
            sources: list of either string paths to PDF, DOCX or CSV files, or
                NeuralDB document objects. A falsy value is returned unchanged.

        Raises:
            RuntimeError: if a string path has none of the supported endings.
        """
        from thirdai import neural_db as ndb

        if not sources:
            return sources

        # File ending -> name of the ndb document class that loads it. The
        # class is looked up by name only when a path actually needs it.
        loader_names = ((".pdf", "PDF"), (".docx", "DOCX"), (".csv", "CSV"))

        def _to_document(path):  # type: ignore[no-untyped-def]
            lowered = path.lower()
            for ending, class_name in loader_names:
                if lowered.endswith(ending):
                    return getattr(ndb, class_name)(path)
            raise RuntimeError(
                f"Could not automatically load {path}. Only files "
                "with .pdf, .docx, or .csv extensions can be loaded "
                "automatically. For other formats, please use the "
                "appropriate document object from the ThirdAI library."
            )

        return [
            _to_document(item) if isinstance(item, str) else item
            for item in sources
        ]

    def upvote(self, query: str, document_id: Union[int, str]):  # type: ignore[no-untyped-def]
        """Tell the vectorstore that a document is a good result for a query.

        The id is converted with ``int`` and the pair is forwarded to the
        database's ``text_to_result``.

        Args:
            query: text to associate with `document_id`
            document_id: id of the document to associate query with; an int or
                a string that ``int`` can parse.
        """
        numeric_id = int(document_id)
        self.db.text_to_result(query, numeric_id)

    def upvote_batch(self, query_id_pairs: List[Tuple[str, int]]):  # type: ignore[no-untyped-def]
        """Upvote many (query, document id) pairs in a single call.

        Every id is converted with ``int`` and the resulting list of pairs is
        forwarded to the database's ``text_to_result_batch``.

        Args:
            query_id_pairs: list of (query, document id) pairs. For each pair in
                this list, the model will upweight the document id for the query.
        """
        normalized_pairs = []
        for query_text, raw_id in query_id_pairs:
            normalized_pairs.append((query_text, int(raw_id)))
        self.db.text_to_result_batch(normalized_pairs)

    def associate(self, source: str, target: str):  # type: ignore[no-untyped-def]
        """Link a source phrase to a target phrase in the vectorstore.

        Once linked, a query containing the source phrase also takes into
        account results that are relevant to the target phrase. The pair is
        forwarded to the database's ``associate``.

        Args:
            source: text to associate to `target`.
            target: text to associate `source` to.
        """
        neural_db = self.db
        neural_db.associate(source, target)

    def associate_batch(self, text_pairs: List[Tuple[str, str]]):  # type: ignore[no-untyped-def]
        """Link many source phrases to their target phrases in a single call.

        The list is forwarded unchanged to the database's ``associate_batch``.

        Args:
            text_pairs: list of (source, target) text pairs. For each pair in
                this list, the source will be associated with the target.
        """
        neural_db = self.db
        neural_db.associate_batch(text_pairs)

    def similarity_search(
        self, query: str, k: int = 10, **kwargs: Any
    ) -> List[Document]:
        """Retrieve {k} contexts with for a given query

        Args:
            query: Query to submit to the model
            k: The max number of context results to retrieve. Defaults to 10.
        """
        try:
            references = self.db.search(query=query, top_k=k, **kwargs)
            return [
                Document(
                    page_content=ref.text,
                    metadata={
                        "id": ref.id,
                        "upvote_ids": ref.upvote_ids,
                        "source": ref.source,
                        "metadata": ref.metadata,
                        "score": ref.score,
                        "context": ref.context(1),
                    },
                )
                for ref in references
            ]
        except Exception as e:
            raise ValueError(f"Error while retrieving documents: {e}") from e

    def save(self, path: str):  # type: ignore[no-untyped-def]
        """Write the wrapped NeuralDB instance to disk.

        The saved instance can be loaded back into memory by calling
        NeuralDB.from_checkpoint(path).

        Args:
            path: path on disk to save the NeuralDB instance to.
        """
        neural_db = self.db
        neural_db.save(path)
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`import importlib`
			`import os`
			`import tempfile`
			`from pathlib import Path`
			`from typing import Any, Dict, Iterable, List, Optional, Tuple, Union`

			`from langchain_core.documents import Document`
			`from langchain_core.embeddings import Embeddings`
			`from langchain_core.pydantic_v1 import Extra, root_validator`
			`from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env`
			`from langchain_core.vectorstores import VectorStore`


			`class NeuralDBVectorStore(VectorStore):`
community: use NeuralDB object to initialize NeuralDBVectorStore (#17272) Description: This PR adds an `__init__` method to the NeuralDBVectorStore class, which takes in a NeuralDB object to instantiate the state of NeuralDBVectorStore. Issue: N/A Dependencies: N/A Twitter handle: N/A 2024-02-22 17:05:01 +00:00			`"""Vectorstore that uses ThirdAI's NeuralDB.`

			To use, you should have the ``thirdai[neural_db]`` python package installed.

			`Example:`
			`.. code-block:: python`

			`from langchain_community.vectorstores import NeuralDBVectorStore`
			`from thirdai import neural_db as ndb`

			`db = ndb.NeuralDB()`
			`vectorstore = NeuralDBVectorStore(db=db)`
			`"""`

community[patch]: fix lint (#17984) 2024-02-22 23:15:27 +00:00			`def __init__(self, db: Any) -> None:`
community: use NeuralDB object to initialize NeuralDBVectorStore (#17272) Description: This PR adds an `__init__` method to the NeuralDBVectorStore class, which takes in a NeuralDB object to instantiate the state of NeuralDBVectorStore. Issue: N/A Dependencies: N/A Twitter handle: N/A 2024-02-22 17:05:01 +00:00			`self.db = db`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00
			`db: Any = None #: :meta private:`
			`"""NeuralDB instance"""`

			`class Config:`
			`"""Configuration for this pydantic object."""`

			`extra = Extra.forbid`
			`underscore_attrs_are_private = True`

			`@staticmethod`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def _verify_thirdai_library(thirdai_key: Optional[str] = None): # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`try:`
			`from thirdai import licensing`

			`importlib.util.find_spec("thirdai.neural_db")`

			`licensing.activate(thirdai_key or os.getenv("THIRDAI_KEY"))`
			`except ImportError:`
			`raise ModuleNotFoundError(`
			`"Could not import thirdai python package and neuraldb dependencies. "`
			"Please install it with `pip install thirdai[neural_db]`."
			`)`

			`@classmethod`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def from_scratch( # type: ignore[no-untyped-def, no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`cls,`
			`thirdai_key: Optional[str] = None,`
			`**model_kwargs,`
			`):`
			`"""`
			`Create a NeuralDBVectorStore from scratch.`

			To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI
			API key, or pass ``thirdai_key`` as a named parameter.

			`Example:`
			`.. code-block:: python`

			`from langchain_community.vectorstores import NeuralDBVectorStore`

			`vectorstore = NeuralDBVectorStore.from_scratch(`
			`thirdai_key="your-thirdai-key",`
			`)`

			`vectorstore.insert([`
			`"/path/to/doc.pdf",`
			`"/path/to/doc.docx",`
			`"/path/to/doc.csv",`
			`])`

			`documents = vectorstore.similarity_search("AI-driven music therapy")`
			`"""`
			`NeuralDBVectorStore._verify_thirdai_library(thirdai_key)`
			`from thirdai import neural_db as ndb`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`return cls(db=ndb.NeuralDB(**model_kwargs)) # type: ignore[call-arg]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00
			`@classmethod`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def from_checkpoint( # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`cls,`
			`checkpoint: Union[str, Path],`
			`thirdai_key: Optional[str] = None,`
			`):`
			`"""`
			`Create a NeuralDBVectorStore with a base model from a saved checkpoint`

			To use, set the ``THIRDAI_KEY`` environment variable with your ThirdAI
			API key, or pass ``thirdai_key`` as a named parameter.

			`Example:`
			`.. code-block:: python`

			`from langchain_community.vectorstores import NeuralDBVectorStore`

			`vectorstore = NeuralDBVectorStore.from_checkpoint(`
			`checkpoint="/path/to/checkpoint.ndb",`
			`thirdai_key="your-thirdai-key",`
			`)`

			`vectorstore.insert([`
			`"/path/to/doc.pdf",`
			`"/path/to/doc.docx",`
			`"/path/to/doc.csv",`
			`])`

			`documents = vectorstore.similarity_search("AI-driven music therapy")`
			`"""`
			`NeuralDBVectorStore._verify_thirdai_library(thirdai_key)`
			`from thirdai import neural_db as ndb`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`return cls(db=ndb.NeuralDB.from_checkpoint(checkpoint)) # type: ignore[call-arg]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00
			`@classmethod`
			`def from_texts(`
			`cls,`
			`texts: List[str],`
			`embedding: Embeddings,`
			`metadatas: Optional[List[dict]] = None,`
			`**kwargs: Any,`
			`) -> "NeuralDBVectorStore":`
			`"""Return VectorStore initialized from texts and embeddings."""`
			`model_kwargs = {}`
			`if "thirdai_key" in kwargs:`
			`model_kwargs["thirdai_key"] = kwargs["thirdai_key"]`
			`del kwargs["thirdai_key"]`
			`vectorstore = cls.from_scratch(**model_kwargs)`
			`vectorstore.add_texts(texts, metadatas, **kwargs)`
			`return vectorstore`

			`def add_texts(`
			`self,`
			`texts: Iterable[str],`
			`metadatas: Optional[List[dict]] = None,`
			`**kwargs: Any,`
			`) -> List[str]:`
			`"""Run more texts through the embeddings and add to the vectorstore.`

			`Args:`
			`texts: Iterable of strings to add to the vectorstore.`
			`metadatas: Optional list of metadatas associated with the texts.`
			`kwargs: vectorstore specific parameters`

			`Returns:`
			`List of ids from adding the texts into the vectorstore.`
			`"""`
			`import pandas as pd`
			`from thirdai import neural_db as ndb`

			`df = pd.DataFrame({"texts": texts})`
			`if metadatas:`
			`df = pd.concat([df, pd.DataFrame.from_records(metadatas)], axis=1)`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`temp = tempfile.NamedTemporaryFile("w", delete=False, delete_on_close=False) # type: ignore[call-overload]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`df.to_csv(temp)`
			`source_id = self.insert([ndb.CSV(temp.name)], **kwargs)[0]`
			`offset = self.db._savable_state.documents.get_source_by_id(source_id)[1]`
infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`return [str(offset + i) for i in range(len(texts))] # type: ignore[arg-type]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00
			`@root_validator()`
			`def validate_environments(cls, values: Dict) -> Dict:`
			`"""Validate ThirdAI environment variables."""`
			`values["thirdai_key"] = convert_to_secret_str(`
			`get_from_dict_or_env(`
			`values,`
			`"thirdai_key",`
			`"THIRDAI_KEY",`
			`)`
			`)`
			`return values`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def insert( # type: ignore[no-untyped-def, no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`self,`
			`sources: List[Any],`
			`train: bool = True,`
			`fast_mode: bool = True,`
			`**kwargs,`
			`):`
			`"""Inserts files / document sources into the vectorstore.`

			`Args:`
			`train: When True this means that the underlying model in the`
			`NeuralDB will undergo unsupervised pretraining on the inserted files.`
			`Defaults to True.`
			`fast_mode: Much faster insertion with a slight drop in performance.`
			`Defaults to True.`
			`"""`
			`sources = self._preprocess_sources(sources)`
			`self.db.insert(`
			`sources=sources,`
			`train=train,`
			`fast_approximation=fast_mode,`
			`**kwargs,`
			`)`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def _preprocess_sources(self, sources): # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`"""Checks if the provided sources are string paths. If they are, convert`
			`to NeuralDB document objects.`

			`Args:`
			`sources: list of either string paths to PDF, DOCX or CSV files, or`
			`NeuralDB document objects.`
			`"""`
			`from thirdai import neural_db as ndb`

			`if not sources:`
			`return sources`
			`preprocessed_sources = []`
			`for doc in sources:`
			`if not isinstance(doc, str):`
			`preprocessed_sources.append(doc)`
			`else:`
			`if doc.lower().endswith(".pdf"):`
			`preprocessed_sources.append(ndb.PDF(doc))`
			`elif doc.lower().endswith(".docx"):`
			`preprocessed_sources.append(ndb.DOCX(doc))`
			`elif doc.lower().endswith(".csv"):`
			`preprocessed_sources.append(ndb.CSV(doc))`
			`else:`
			`raise RuntimeError(`
			`f"Could not automatically load {doc}. Only files "`
			`"with .pdf, .docx, or .csv extensions can be loaded "`
			`"automatically. For other formats, please use the "`
			`"appropriate document object from the ThirdAI library."`
			`)`
			`return preprocessed_sources`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def upvote(self, query: str, document_id: Union[int, str]): # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`"""The vectorstore upweights the score of a document for a specific query.`
			`This is useful for fine-tuning the vectorstore to user behavior.`

			`Args:`
			query: text to associate with `document_id`
			`document_id: id of the document to associate query with.`
			`"""`
			`self.db.text_to_result(query, int(document_id))`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def upvote_batch(self, query_id_pairs: List[Tuple[str, int]]): # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`"""Given a batch of (query, document id) pairs, the vectorstore upweights`
			`the scores of the document for the corresponding queries.`
			`This is useful for fine-tuning the vectorstore to user behavior.`

			`Args:`
			`query_id_pairs: list of (query, document id) pairs. For each pair in`
			`this list, the model will upweight the document id for the query.`
			`"""`
			`self.db.text_to_result_batch(`
			`[(query, int(doc_id)) for query, doc_id in query_id_pairs]`
			`)`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def associate(self, source: str, target: str): # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`"""The vectorstore associates a source phrase with a target phrase.`
			`When the vectorstore sees the source phrase, it will also consider results`
			`that are relevant to the target phrase.`

			`Args:`
			source: text to associate to `target`.
			target: text to associate `source` to.
			`"""`
			`self.db.associate(source, target)`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def associate_batch(self, text_pairs: List[Tuple[str, str]]): # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`"""Given a batch of (source, target) pairs, the vectorstore associates`
			`each source phrase with the corresponding target phrase.`

			`Args:`
			`text_pairs: list of (source, target) text pairs. For each pair in`
			`this list, the source will be associated with the target.`
			`"""`
			`self.db.associate_batch(text_pairs)`

			`def similarity_search(`
			`self, query: str, k: int = 10, **kwargs: Any`
			`) -> List[Document]:`
			`"""Retrieve {k} contexts with for a given query`

			`Args:`
			`query: Query to submit to the model`
			`k: The max number of context results to retrieve. Defaults to 10.`
			`"""`
			`try:`
			`references = self.db.search(query=query, top_k=k, **kwargs)`
			`return [`
			`Document(`
			`page_content=ref.text,`
			`metadata={`
			`"id": ref.id,`
			`"upvote_ids": ref.upvote_ids,`
			`"source": ref.source,`
			`"metadata": ref.metadata,`
			`"score": ref.score,`
			`"context": ref.context(1),`
			`},`
			`)`
			`for ref in references`
			`]`
			`except Exception as e:`
			`raise ValueError(f"Error while retrieving documents: {e}") from e`

infra: add -p to mkdir in lint steps (#17013) Previously, if this did not find a mypy cache then it wouldnt run this makes it always run adding mypy ignore comments with existing uncaught issues to unblock other prs --------- Co-authored-by: Erick Friis <erick@langchain.dev> Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com> 2024-02-05 19:22:06 +00:00			`def save(self, path: str): # type: ignore[no-untyped-def]`
community: Added integrations for ThirdAI's NeuralDB with Retriever and VectorStore frameworks (#15280) Description: Adds ThirdAI NeuralDB retriever and vectorstore integration. NeuralDB is a CPU-friendly and fine-tunable text retrieval engine. 2024-01-29 16:35:42 +00:00			`"""Saves a NeuralDB instance to disk. Can be loaded into memory by`
			`calling NeuralDB.from_checkpoint(path)`

			`Args:`
			`path: path on disk to save the NeuralDB instance to.`
			`"""`
			`self.db.save(path)`