From 5f13668fa08449c846eb69e9ccb98e177f019bb6 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 25 Sep 2023 12:44:23 -0700 Subject: [PATCH] Harrison/move vectorstore base (#11030) --- .../agents/autonomous_agents/baby_agi.ipynb | 2 +- .../baby_agi_with_agent.ipynb | 2 +- .../autonomous_agents/autogpt/agent.py | 2 +- .../autonomous_agents/autogpt/memory.py | 2 +- .../autonomous_agents/autogpt/prompt.py | 2 +- .../autonomous_agents/baby_agi/baby_agi.py | 2 +- .../agent_toolkits/vectorstore/toolkit.py | 2 +- .../chains/conversational_retrieval/base.py | 2 +- .../chains/qa_with_sources/vector_db.py | 2 +- .../langchain/chains/retrieval_qa/base.py | 4 +- .../chains/router/embedding_router.py | 2 +- libs/langchain/langchain/indexes/_api.py | 2 +- .../langchain/indexes/vectorstore.py | 2 +- .../langchain/langchain/memory/vectorstore.py | 2 +- .../example_selector/semantic_similarity.py | 2 +- .../retrievers/time_weighted_retriever.py | 2 +- .../langchain/retrievers/web_research.py | 2 +- .../langchain/langchain/schema/vectorstore.py | 611 ++++++++++++++++++ .../langchain/tools/vectorstore/tool.py | 2 +- .../langchain/vectorstores/__init__.py | 2 +- .../vectorstores/alibabacloud_opensearch.py | 2 +- .../langchain/vectorstores/analyticdb.py | 2 +- .../langchain/langchain/vectorstores/annoy.py | 2 +- .../langchain/langchain/vectorstores/atlas.py | 2 +- .../langchain/langchain/vectorstores/awadb.py | 2 +- .../langchain/vectorstores/azuresearch.py | 2 +- .../langchain/vectorstores/bageldb.py | 2 +- libs/langchain/langchain/vectorstores/base.py | 609 +---------------- .../langchain/vectorstores/cassandra.py | 2 +- .../langchain/vectorstores/chroma.py | 2 +- .../langchain/vectorstores/clarifai.py | 2 +- .../langchain/vectorstores/clickhouse.py | 2 +- .../langchain/vectorstores/dashvector.py | 2 +- .../langchain/vectorstores/deeplake.py | 2 +- .../langchain/langchain/vectorstores/dingo.py | 2 +- .../vectorstores/elastic_vector_search.py | 2 +- .../langchain/vectorstores/elasticsearch.py | 2 +- .../langchain/vectorstores/epsilla.py | 2 +- .../langchain/langchain/vectorstores/faiss.py | 2 +- .../langchain/vectorstores/hologres.py | 2 +- .../langchain/vectorstores/lancedb.py | 2 +- .../langchain/vectorstores/llm_rails.py | 2 +- .../langchain/langchain/vectorstores/marqo.py | 2 +- .../langchain/vectorstores/matching_engine.py | 2 +- .../langchain/vectorstores/meilisearch.py | 2 +- .../langchain/vectorstores/milvus.py | 2 +- .../langchain/vectorstores/mongodb_atlas.py | 2 +- .../langchain/vectorstores/myscale.py | 2 +- .../langchain/vectorstores/neo4j_vector.py | 2 +- .../langchain/vectorstores/nucliadb.py | 2 +- .../vectorstores/opensearch_vector_search.py | 2 +- .../langchain/vectorstores/pgembedding.py | 2 +- .../langchain/vectorstores/pgvector.py | 2 +- .../langchain/vectorstores/pinecone.py | 2 +- .../langchain/vectorstores/redis/base.py | 2 +- .../langchain/vectorstores/rocksetdb.py | 2 +- .../langchain/langchain/vectorstores/scann.py | 2 +- .../langchain/vectorstores/singlestoredb.py | 2 +- .../langchain/vectorstores/sklearn.py | 2 +- .../langchain/vectorstores/sqlitevss.py | 2 +- .../langchain/vectorstores/starrocks.py | 2 +- .../langchain/vectorstores/supabase.py | 2 +- libs/langchain/langchain/vectorstores/tair.py | 2 +- .../langchain/vectorstores/tencentvectordb.py | 2 +- .../langchain/vectorstores/timescalevector.py | 2 +- .../langchain/vectorstores/typesense.py | 2 +- .../langchain/vectorstores/usearch.py | 2 +- libs/langchain/langchain/vectorstores/vald.py | 2 +- .../langchain/vectorstores/vearch.py | 2 +- .../langchain/vectorstores/vectara.py | 2 +- .../langchain/vectorstores/weaviate.py | 2 +- libs/langchain/langchain/vectorstores/xata.py | 2 +- libs/langchain/langchain/vectorstores/zep.py | 2 +- .../tests/unit_tests/indexes/test_indexing.py | 2 +- .../test_time_weighted_retriever.py | 2 +- 75 files changed, 687 insertions(+), 681 deletions(-) create mode 100644 libs/langchain/langchain/schema/vectorstore.py diff --git a/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi.ipynb b/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi.ipynb index b0bb79e1d4..c30d2fceeb 100644 --- a/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi.ipynb +++ b/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi.ipynb @@ -36,7 +36,7 @@ "from langchain.chains import LLMChain\nfrom langchain.llms import OpenAI\nfrom langchain.prompts import PromptTemplate\n", "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.llms import BaseLLM\n", - "from langchain.vectorstores.base import VectorStore\n", + "from langchain.schema.vectorstore import VectorStore\n", "from pydantic import BaseModel, Field\n", "from langchain.chains.base import Chain\n", "from langchain_experimental.autonomous_agents import BabyAGI" diff --git a/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi_with_agent.ipynb b/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi_with_agent.ipynb index bf03f95e0c..b3cc5db10a 100644 --- a/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi_with_agent.ipynb +++ b/docs/extras/use_cases/more/agents/autonomous_agents/baby_agi_with_agent.ipynb @@ -32,7 +32,7 @@ "from langchain.chains import LLMChain\nfrom langchain.llms import OpenAI\nfrom langchain.prompts import PromptTemplate\n", "from langchain.embeddings import OpenAIEmbeddings\n", "from langchain.llms import BaseLLM\n", - "from langchain.vectorstores.base import VectorStore\n", + "from langchain.schema.vectorstore import VectorStore\n", "from pydantic import BaseModel, Field\n", "from langchain.chains.base import Chain\n", "from langchain_experimental.autonomous_agents import BabyAGI" diff --git a/libs/experimental/langchain_experimental/autonomous_agents/autogpt/agent.py b/libs/experimental/langchain_experimental/autonomous_agents/autogpt/agent.py index 5872f0fd74..bffd6b2c21 100644 --- a/libs/experimental/langchain_experimental/autonomous_agents/autogpt/agent.py +++ b/libs/experimental/langchain_experimental/autonomous_agents/autogpt/agent.py @@ -10,9 +10,9 @@ from langchain.schema import ( Document, ) from langchain.schema.messages import AIMessage, HumanMessage, SystemMessage +from langchain.schema.vectorstore import VectorStoreRetriever from langchain.tools.base import BaseTool from langchain.tools.human.tool import HumanInputRun -from langchain.vectorstores.base import VectorStoreRetriever from langchain_experimental.autonomous_agents.autogpt.output_parser import ( AutoGPTOutputParser, diff --git a/libs/experimental/langchain_experimental/autonomous_agents/autogpt/memory.py b/libs/experimental/langchain_experimental/autonomous_agents/autogpt/memory.py index 549a995563..41f8339b1e 100644 --- a/libs/experimental/langchain_experimental/autonomous_agents/autogpt/memory.py +++ b/libs/experimental/langchain_experimental/autonomous_agents/autogpt/memory.py @@ -1,7 +1,7 @@ from typing import Any, Dict, List from langchain.memory.chat_memory import BaseChatMemory, get_prompt_input_key -from langchain.vectorstores.base import VectorStoreRetriever +from langchain.schema.vectorstore import VectorStoreRetriever from langchain_experimental.pydantic_v1 import Field diff --git a/libs/experimental/langchain_experimental/autonomous_agents/autogpt/prompt.py b/libs/experimental/langchain_experimental/autonomous_agents/autogpt/prompt.py index 68da11b0c3..af27b289c6 100644 --- a/libs/experimental/langchain_experimental/autonomous_agents/autogpt/prompt.py +++ b/libs/experimental/langchain_experimental/autonomous_agents/autogpt/prompt.py @@ -5,8 +5,8 @@ from langchain.prompts.chat import ( BaseChatPromptTemplate, ) from langchain.schema.messages import BaseMessage, HumanMessage, SystemMessage +from langchain.schema.vectorstore import VectorStoreRetriever from langchain.tools.base import BaseTool -from langchain.vectorstores.base import VectorStoreRetriever from langchain_experimental.autonomous_agents.autogpt.prompt_generator import get_prompt from langchain_experimental.pydantic_v1 import BaseModel diff --git a/libs/experimental/langchain_experimental/autonomous_agents/baby_agi/baby_agi.py b/libs/experimental/langchain_experimental/autonomous_agents/baby_agi/baby_agi.py index 6dd5d15b14..9fef44f0e6 100644 --- a/libs/experimental/langchain_experimental/autonomous_agents/baby_agi/baby_agi.py +++ b/libs/experimental/langchain_experimental/autonomous_agents/baby_agi/baby_agi.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional from langchain.callbacks.manager import CallbackManagerForChainRun from langchain.chains.base import Chain from langchain.schema.language_model import BaseLanguageModel -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain_experimental.autonomous_agents.baby_agi.task_creation import ( TaskCreationChain, diff --git a/libs/langchain/langchain/agents/agent_toolkits/vectorstore/toolkit.py b/libs/langchain/langchain/agents/agent_toolkits/vectorstore/toolkit.py index c9399a9359..724a9d4558 100644 --- a/libs/langchain/langchain/agents/agent_toolkits/vectorstore/toolkit.py +++ b/libs/langchain/langchain/agents/agent_toolkits/vectorstore/toolkit.py @@ -5,12 +5,12 @@ from langchain.agents.agent_toolkits.base import BaseToolkit from langchain.llms.openai import OpenAI from langchain.pydantic_v1 import BaseModel, Field from langchain.schema.language_model import BaseLanguageModel +from langchain.schema.vectorstore import VectorStore from langchain.tools import BaseTool from langchain.tools.vectorstore.tool import ( VectorStoreQATool, VectorStoreQAWithSourcesTool, ) -from langchain.vectorstores.base import VectorStore class VectorStoreInfo(BaseModel): diff --git a/libs/langchain/langchain/chains/conversational_retrieval/base.py b/libs/langchain/langchain/chains/conversational_retrieval/base.py index 911094e6d9..a8beab3e9e 100644 --- a/libs/langchain/langchain/chains/conversational_retrieval/base.py +++ b/libs/langchain/langchain/chains/conversational_retrieval/base.py @@ -22,7 +22,7 @@ from langchain.pydantic_v1 import Extra, Field, root_validator from langchain.schema import BasePromptTemplate, BaseRetriever, Document from langchain.schema.language_model import BaseLanguageModel from langchain.schema.messages import BaseMessage -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore # Depending on the memory type and configuration, the chat history format may differ. # This needs to be consolidated. diff --git a/libs/langchain/langchain/chains/qa_with_sources/vector_db.py b/libs/langchain/langchain/chains/qa_with_sources/vector_db.py index 19a491fcb1..44659d9170 100644 --- a/libs/langchain/langchain/chains/qa_with_sources/vector_db.py +++ b/libs/langchain/langchain/chains/qa_with_sources/vector_db.py @@ -11,7 +11,7 @@ from langchain.chains.combine_documents.stuff import StuffDocumentsChain from langchain.chains.qa_with_sources.base import BaseQAWithSourcesChain from langchain.docstore.document import Document from langchain.pydantic_v1 import Field, root_validator -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore class VectorDBQAWithSourcesChain(BaseQAWithSourcesChain): diff --git a/libs/langchain/langchain/chains/retrieval_qa/base.py b/libs/langchain/langchain/chains/retrieval_qa/base.py index 481aab5a3b..6a506bad69 100644 --- a/libs/langchain/langchain/chains/retrieval_qa/base.py +++ b/libs/langchain/langchain/chains/retrieval_qa/base.py @@ -21,7 +21,7 @@ from langchain.prompts import PromptTemplate from langchain.pydantic_v1 import Extra, Field, root_validator from langchain.schema import BaseRetriever, Document from langchain.schema.language_model import BaseLanguageModel -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore class BaseRetrievalQA(Chain): @@ -198,7 +198,7 @@ class RetrievalQA(BaseRetrievalQA): from langchain.llms import OpenAI from langchain.chains import RetrievalQA from langchain.faiss import FAISS - from langchain.vectorstores.base import VectorStoreRetriever + from langchain.schema.vectorstore import VectorStoreRetriever retriever = VectorStoreRetriever(vectorstore=FAISS(...)) retrievalQA = RetrievalQA.from_llm(llm=OpenAI(), retriever=retriever) diff --git a/libs/langchain/langchain/chains/router/embedding_router.py b/libs/langchain/langchain/chains/router/embedding_router.py index 6042f07a11..1f7a716076 100644 --- a/libs/langchain/langchain/chains/router/embedding_router.py +++ b/libs/langchain/langchain/chains/router/embedding_router.py @@ -7,7 +7,7 @@ from langchain.chains.router.base import RouterChain from langchain.docstore.document import Document from langchain.pydantic_v1 import Extra from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore class EmbeddingRouterChain(RouterChain): diff --git a/libs/langchain/langchain/indexes/_api.py b/libs/langchain/langchain/indexes/_api.py index c62f0e1ed7..88ab4e6a9a 100644 --- a/libs/langchain/langchain/indexes/_api.py +++ b/libs/langchain/langchain/indexes/_api.py @@ -25,7 +25,7 @@ from langchain.document_loaders.base import BaseLoader from langchain.indexes.base import NAMESPACE_UUID, RecordManager from langchain.pydantic_v1 import root_validator from langchain.schema import Document -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore T = TypeVar("T") diff --git a/libs/langchain/langchain/indexes/vectorstore.py b/libs/langchain/langchain/indexes/vectorstore.py index 5cae05c711..d15fb7a734 100644 --- a/libs/langchain/langchain/indexes/vectorstore.py +++ b/libs/langchain/langchain/indexes/vectorstore.py @@ -9,8 +9,8 @@ from langchain.pydantic_v1 import BaseModel, Extra, Field from langchain.schema import Document from langchain.schema.embeddings import Embeddings from langchain.schema.language_model import BaseLanguageModel +from langchain.schema.vectorstore import VectorStore from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.chroma import Chroma diff --git a/libs/langchain/langchain/memory/vectorstore.py b/libs/langchain/langchain/memory/vectorstore.py index b368fbab38..a35e74b6b3 100644 --- a/libs/langchain/langchain/memory/vectorstore.py +++ b/libs/langchain/langchain/memory/vectorstore.py @@ -6,7 +6,7 @@ from langchain.memory.chat_memory import BaseMemory from langchain.memory.utils import get_prompt_input_key from langchain.pydantic_v1 import Field from langchain.schema import Document -from langchain.vectorstores.base import VectorStoreRetriever +from langchain.schema.vectorstore import VectorStoreRetriever class VectorStoreRetrieverMemory(BaseMemory): diff --git a/libs/langchain/langchain/prompts/example_selector/semantic_similarity.py b/libs/langchain/langchain/prompts/example_selector/semantic_similarity.py index 3409c1896b..4548b3b287 100644 --- a/libs/langchain/langchain/prompts/example_selector/semantic_similarity.py +++ b/libs/langchain/langchain/prompts/example_selector/semantic_similarity.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Type from langchain.prompts.example_selector.base import BaseExampleSelector from langchain.pydantic_v1 import BaseModel, Extra from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore def sorted_values(values: Dict[str, str]) -> List[Any]: diff --git a/libs/langchain/langchain/retrievers/time_weighted_retriever.py b/libs/langchain/langchain/retrievers/time_weighted_retriever.py index 75b49a8c37..644a5a7dec 100644 --- a/libs/langchain/langchain/retrievers/time_weighted_retriever.py +++ b/libs/langchain/langchain/retrievers/time_weighted_retriever.py @@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple from langchain.callbacks.manager import CallbackManagerForRetrieverRun from langchain.pydantic_v1 import Field from langchain.schema import BaseRetriever, Document -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore def _get_hours_passed(time: datetime.datetime, ref_time: datetime.datetime) -> float: diff --git a/libs/langchain/langchain/retrievers/web_research.py b/libs/langchain/langchain/retrievers/web_research.py index cf53243c66..ac8ecb6c75 100644 --- a/libs/langchain/langchain/retrievers/web_research.py +++ b/libs/langchain/langchain/retrievers/web_research.py @@ -16,9 +16,9 @@ from langchain.output_parsers.pydantic import PydanticOutputParser from langchain.prompts import BasePromptTemplate, PromptTemplate from langchain.pydantic_v1 import BaseModel, Field from langchain.schema import BaseRetriever, Document +from langchain.schema.vectorstore import VectorStore from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter from langchain.utilities import GoogleSearchAPIWrapper -from langchain.vectorstores.base import VectorStore logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/schema/vectorstore.py b/libs/langchain/langchain/schema/vectorstore.py new file mode 100644 index 0000000000..68c7f94c7f --- /dev/null +++ b/libs/langchain/langchain/schema/vectorstore.py @@ -0,0 +1,611 @@ +from __future__ import annotations + +import asyncio +import logging +import math +import warnings +from abc import ABC, abstractmethod +from functools import partial +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Collection, + Dict, + Iterable, + List, + Optional, + Tuple, + Type, + TypeVar, +) + +from langchain.pydantic_v1 import Field, root_validator +from langchain.schema import BaseRetriever +from langchain.schema.document import Document +from langchain.schema.embeddings import Embeddings + +if TYPE_CHECKING: + from langchain.callbacks.manager import ( + AsyncCallbackManagerForRetrieverRun, + CallbackManagerForRetrieverRun, + ) + +logger = logging.getLogger(__name__) + +VST = TypeVar("VST", bound="VectorStore") + + +class VectorStore(ABC): + """Interface for vector store.""" + + @abstractmethod + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore. + + Args: + texts: Iterable of strings to add to the vectorstore. + metadatas: Optional list of metadatas associated with the texts. + kwargs: vectorstore specific parameters + + Returns: + List of ids from adding the texts into the vectorstore. + """ + + @property + def embeddings(self) -> Optional[Embeddings]: + """Access the query embedding object if available.""" + logger.debug( + f"{Embeddings.__name__} is not implemented for {self.__class__.__name__}" + ) + return None + + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: + """Delete by vector ID or other criteria. + + Args: + ids: List of ids to delete. + **kwargs: Other keyword arguments that subclasses might use. + + Returns: + Optional[bool]: True if deletion is successful, + False otherwise, None if not implemented. + """ + + raise NotImplementedError("delete method must be implemented by subclass.") + + async def aadd_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Run more texts through the embeddings and add to the vectorstore.""" + raise NotImplementedError + + def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: + """Run more documents through the embeddings and add to the vectorstore. + + Args: + documents (List[Document]: Documents to add to the vectorstore. + + Returns: + List[str]: List of IDs of the added texts. + """ + # TODO: Handle the case where the user doesn't provide ids on the Collection + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + return self.add_texts(texts, metadatas, **kwargs) + + async def aadd_documents( + self, documents: List[Document], **kwargs: Any + ) -> List[str]: + """Run more documents through the embeddings and add to the vectorstore. + + Args: + documents (List[Document]: Documents to add to the vectorstore. + + Returns: + List[str]: List of IDs of the added texts. + """ + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + return await self.aadd_texts(texts, metadatas, **kwargs) + + def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]: + """Return docs most similar to query using specified search type.""" + if search_type == "similarity": + return self.similarity_search(query, **kwargs) + elif search_type == "mmr": + return self.max_marginal_relevance_search(query, **kwargs) + else: + raise ValueError( + f"search_type of {search_type} not allowed. Expected " + "search_type to be 'similarity' or 'mmr'." + ) + + async def asearch( + self, query: str, search_type: str, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query using specified search type.""" + if search_type == "similarity": + return await self.asimilarity_search(query, **kwargs) + elif search_type == "mmr": + return await self.amax_marginal_relevance_search(query, **kwargs) + else: + raise ValueError( + f"search_type of {search_type} not allowed. Expected " + "search_type to be 'similarity' or 'mmr'." + ) + + @abstractmethod + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query.""" + + @staticmethod + def _euclidean_relevance_score_fn(distance: float) -> float: + """Return a similarity score on a scale [0, 1].""" + # The 'correct' relevance function + # may differ depending on a few things, including: + # - the distance / similarity metric used by the VectorStore + # - the scale of your embeddings (OpenAI's are unit normed. Many + # others are not!) + # - embedding dimensionality + # - etc. + # This function converts the euclidean norm of normalized embeddings + # (0 is most similar, sqrt(2) most dissimilar) + # to a similarity function (0 to 1) + return 1.0 - distance / math.sqrt(2) + + @staticmethod + def _cosine_relevance_score_fn(distance: float) -> float: + """Normalize the distance to a score on a scale [0, 1].""" + + return 1.0 - distance + + @staticmethod + def _max_inner_product_relevance_score_fn(distance: float) -> float: + """Normalize the distance to a score on a scale [0, 1].""" + if distance > 0: + return 1.0 - distance + + return -1.0 * distance + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """ + The 'correct' relevance function + may differ depending on a few things, including: + - the distance / similarity metric used by the VectorStore + - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) + - embedding dimensionality + - etc. + + Vectorstores should define their own selection based method of relevance. + """ + raise NotImplementedError + + def similarity_search_with_score( + self, *args: Any, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Run similarity search with distance.""" + raise NotImplementedError + + def _similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """ + Default similarity search with relevance scores. Modify if necessary + in subclass. + Return docs and relevance scores in the range [0, 1]. + + 0 is dissimilar, 1 is most similar. + + Args: + query: input text + k: Number of Documents to return. Defaults to 4. + **kwargs: kwargs to be passed to similarity search. Should include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of Tuples of (doc, similarity_score) + """ + relevance_score_fn = self._select_relevance_score_fn() + docs_and_scores = self.similarity_search_with_score(query, k, **kwargs) + return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores] + + def similarity_search_with_relevance_scores( + self, + query: str, + k: int = 4, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and relevance scores in the range [0, 1]. + + 0 is dissimilar, 1 is most similar. + + Args: + query: input text + k: Number of Documents to return. Defaults to 4. + **kwargs: kwargs to be passed to similarity search. Should include: + score_threshold: Optional, a floating point value between 0 to 1 to + filter the resulting set of retrieved docs + + Returns: + List of Tuples of (doc, similarity_score) + """ + score_threshold = kwargs.pop("score_threshold", None) + + docs_and_similarities = self._similarity_search_with_relevance_scores( + query, k=k, **kwargs + ) + if any( + similarity < 0.0 or similarity > 1.0 + for _, similarity in docs_and_similarities + ): + warnings.warn( + "Relevance scores must be between" + f" 0 and 1, got {docs_and_similarities}" + ) + + if score_threshold is not None: + docs_and_similarities = [ + (doc, similarity) + for doc, similarity in docs_and_similarities + if similarity >= score_threshold + ] + if len(docs_and_similarities) == 0: + warnings.warn( + "No relevant docs were retrieved using the relevance score" + f" threshold {score_threshold}" + ) + return docs_and_similarities + + async def asimilarity_search_with_relevance_scores( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Tuple[Document, float]]: + """Return docs most similar to query.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial( + self.similarity_search_with_relevance_scores, query, k=k, **kwargs + ) + return await asyncio.get_event_loop().run_in_executor(None, func) + + async def asimilarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to query.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial(self.similarity_search, query, k=k, **kwargs) + return await asyncio.get_event_loop().run_in_executor(None, func) + + def similarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to embedding vector. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + + Returns: + List of Documents most similar to the query vector. + """ + raise NotImplementedError + + async def asimilarity_search_by_vector( + self, embedding: List[float], k: int = 4, **kwargs: Any + ) -> List[Document]: + """Return docs most similar to embedding vector.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial(self.similarity_search_by_vector, embedding, k=k, **kwargs) + return await asyncio.get_event_loop().run_in_executor(None, func) + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + query: Text to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + raise NotImplementedError + + async def amax_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial( + self.max_marginal_relevance_search, + query, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + **kwargs, + ) + return await asyncio.get_event_loop().run_in_executor(None, func) + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k: Number of Documents to return. Defaults to 4. + fetch_k: Number of Documents to fetch to pass to MMR algorithm. + lambda_mult: Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + Returns: + List of Documents selected by maximal marginal relevance. + """ + raise NotImplementedError + + async def amax_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + raise NotImplementedError + + @classmethod + def from_documents( + cls: Type[VST], + documents: List[Document], + embedding: Embeddings, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from documents and embeddings.""" + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs) + + @classmethod + async def afrom_documents( + cls: Type[VST], + documents: List[Document], + embedding: Embeddings, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from documents and embeddings.""" + texts = [d.page_content for d in documents] + metadatas = [d.metadata for d in documents] + return await cls.afrom_texts(texts, embedding, metadatas=metadatas, **kwargs) + + @classmethod + @abstractmethod + def from_texts( + cls: Type[VST], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from texts and embeddings.""" + + @classmethod + async def afrom_texts( + cls: Type[VST], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from texts and embeddings.""" + raise NotImplementedError + + def _get_retriever_tags(self) -> List[str]: + """Get tags for retriever.""" + tags = [self.__class__.__name__] + if self.embeddings: + tags.append(self.embeddings.__class__.__name__) + return tags + + def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever: + """Return VectorStoreRetriever initialized from this VectorStore. + + Args: + search_type (Optional[str]): Defines the type of search that + the Retriever should perform. + Can be "similarity" (default), "mmr", or + "similarity_score_threshold". + search_kwargs (Optional[Dict]): Keyword arguments to pass to the + search function. Can include things like: + k: Amount of documents to return (Default: 4) + score_threshold: Minimum relevance threshold + for similarity_score_threshold + fetch_k: Amount of documents to pass to MMR algorithm (Default: 20) + lambda_mult: Diversity of results returned by MMR; + 1 for minimum diversity and 0 for maximum. (Default: 0.5) + filter: Filter by document metadata + + Returns: + VectorStoreRetriever: Retriever class for VectorStore. + + Examples: + + .. code-block:: python + + # Retrieve more documents with higher diversity + # Useful if your dataset has many similar documents + docsearch.as_retriever( + search_type="mmr", + search_kwargs={'k': 6, 'lambda_mult': 0.25} + ) + + # Fetch more documents for the MMR algorithm to consider + # But only return the top 5 + docsearch.as_retriever( + search_type="mmr", + search_kwargs={'k': 5, 'fetch_k': 50} + ) + + # Only retrieve documents that have a relevance score + # Above a certain threshold + docsearch.as_retriever( + search_type="similarity_score_threshold", + search_kwargs={'score_threshold': 0.8} + ) + + # Only get the single most similar document from the dataset + docsearch.as_retriever(search_kwargs={'k': 1}) + + # Use a filter to only retrieve documents from a specific paper + docsearch.as_retriever( + search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}} + ) + """ + tags = kwargs.pop("tags", None) or [] + tags.extend(self._get_retriever_tags()) + + return VectorStoreRetriever(vectorstore=self, **kwargs, tags=tags) + + +class VectorStoreRetriever(BaseRetriever): + """Base Retriever class for VectorStore.""" + + vectorstore: VectorStore + """VectorStore to use for retrieval.""" + search_type: str = "similarity" + """Type of search to perform. Defaults to "similarity".""" + search_kwargs: dict = Field(default_factory=dict) + """Keyword arguments to pass to the search function.""" + allowed_search_types: ClassVar[Collection[str]] = ( + "similarity", + "similarity_score_threshold", + "mmr", + ) + + class Config: + """Configuration for this pydantic object.""" + + arbitrary_types_allowed = True + + @root_validator() + def validate_search_type(cls, values: Dict) -> Dict: + """Validate search type.""" + search_type = values["search_type"] + if search_type not in cls.allowed_search_types: + raise ValueError( + f"search_type of {search_type} not allowed. Valid values are: " + f"{cls.allowed_search_types}" + ) + if search_type == "similarity_score_threshold": + score_threshold = values["search_kwargs"].get("score_threshold") + if (score_threshold is None) or (not isinstance(score_threshold, float)): + raise ValueError( + "`score_threshold` is not specified with a float value(0~1) " + "in `search_kwargs`." + ) + return values + + def _get_relevant_documents( + self, query: str, *, run_manager: CallbackManagerForRetrieverRun + ) -> List[Document]: + if self.search_type == "similarity": + docs = self.vectorstore.similarity_search(query, **self.search_kwargs) + elif self.search_type == "similarity_score_threshold": + docs_and_similarities = ( + self.vectorstore.similarity_search_with_relevance_scores( + query, **self.search_kwargs + ) + ) + docs = [doc for doc, _ in docs_and_similarities] + elif self.search_type == "mmr": + docs = self.vectorstore.max_marginal_relevance_search( + query, **self.search_kwargs + ) + else: + raise ValueError(f"search_type of {self.search_type} not allowed.") + return docs + + async def _aget_relevant_documents( + self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun + ) -> List[Document]: + if self.search_type == "similarity": + docs = await self.vectorstore.asimilarity_search( + query, **self.search_kwargs + ) + elif self.search_type == "similarity_score_threshold": + docs_and_similarities = ( + await self.vectorstore.asimilarity_search_with_relevance_scores( + query, **self.search_kwargs + ) + ) + docs = [doc for doc, _ in docs_and_similarities] + elif self.search_type == "mmr": + docs = await self.vectorstore.amax_marginal_relevance_search( + query, **self.search_kwargs + ) + else: + raise ValueError(f"search_type of {self.search_type} not allowed.") + return docs + + def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: + """Add documents to vectorstore.""" + return self.vectorstore.add_documents(documents, **kwargs) + + async def aadd_documents( + self, documents: List[Document], **kwargs: Any + ) -> List[str]: + """Add documents to vectorstore.""" + return await self.vectorstore.aadd_documents(documents, **kwargs) diff --git a/libs/langchain/langchain/tools/vectorstore/tool.py b/libs/langchain/langchain/tools/vectorstore/tool.py index 59c4be604a..02a6da1c7a 100644 --- a/libs/langchain/langchain/tools/vectorstore/tool.py +++ b/libs/langchain/langchain/tools/vectorstore/tool.py @@ -8,8 +8,8 @@ from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain from langchain.llms.openai import OpenAI from langchain.pydantic_v1 import BaseModel, Field from langchain.schema.language_model import BaseLanguageModel +from langchain.schema.vectorstore import VectorStore from langchain.tools.base import BaseTool -from langchain.vectorstores.base import VectorStore class BaseVectorStoreTool(BaseModel): diff --git a/libs/langchain/langchain/vectorstores/__init__.py b/libs/langchain/langchain/vectorstores/__init__.py index 18a24b20b0..0b113e04e8 100644 --- a/libs/langchain/langchain/vectorstores/__init__.py +++ b/libs/langchain/langchain/vectorstores/__init__.py @@ -18,6 +18,7 @@ and retrieve the data that are 'most similar' to the embedded query. Embeddings, Document """ # noqa: E501 +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.alibabacloud_opensearch import ( AlibabaCloudOpenSearch, AlibabaCloudOpenSearchSettings, @@ -28,7 +29,6 @@ from langchain.vectorstores.atlas import AtlasDB from langchain.vectorstores.awadb import AwaDB from langchain.vectorstores.azuresearch import AzureSearch from langchain.vectorstores.bageldb import Bagel -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.cassandra import Cassandra from langchain.vectorstores.chroma import Chroma from langchain.vectorstores.clarifai import Clarifai diff --git a/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py b/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py index f7fba6c06e..219978b743 100644 --- a/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py +++ b/libs/langchain/langchain/vectorstores/alibabacloud_opensearch.py @@ -6,7 +6,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple from langchain.schema import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore logger = logging.getLogger() diff --git a/libs/langchain/langchain/vectorstores/analyticdb.py b/libs/langchain/langchain/vectorstores/analyticdb.py index c3b438f059..1792ff6be6 100644 --- a/libs/langchain/langchain/vectorstores/analyticdb.py +++ b/libs/langchain/langchain/vectorstores/analyticdb.py @@ -14,8 +14,8 @@ except ImportError: from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore _LANGCHAIN_DEFAULT_EMBEDDING_DIM = 1536 _LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain_document" diff --git a/libs/langchain/langchain/vectorstores/annoy.py b/libs/langchain/langchain/vectorstores/annoy.py index 28d7a29573..975c006214 100644 --- a/libs/langchain/langchain/vectorstores/annoy.py +++ b/libs/langchain/langchain/vectorstores/annoy.py @@ -13,7 +13,7 @@ from langchain.docstore.base import Docstore from langchain.docstore.document import Document from langchain.docstore.in_memory import InMemoryDocstore from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance INDEX_METRICS = frozenset(["angular", "euclidean", "manhattan", "hamming", "dot"]) diff --git a/libs/langchain/langchain/vectorstores/atlas.py b/libs/langchain/langchain/vectorstores/atlas.py index 3023181830..15541afccb 100644 --- a/libs/langchain/langchain/vectorstores/atlas.py +++ b/libs/langchain/langchain/vectorstores/atlas.py @@ -8,7 +8,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/awadb.py b/libs/langchain/langchain/vectorstores/awadb.py index 2a60ee9a58..258dbed75a 100644 --- a/libs/langchain/langchain/vectorstores/awadb.py +++ b/libs/langchain/langchain/vectorstores/awadb.py @@ -8,7 +8,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/azuresearch.py b/libs/langchain/langchain/vectorstores/azuresearch.py index 9883646ecb..84b3db36ff 100644 --- a/libs/langchain/langchain/vectorstores/azuresearch.py +++ b/libs/langchain/langchain/vectorstores/azuresearch.py @@ -26,8 +26,8 @@ from langchain.docstore.document import Document from langchain.pydantic_v1 import root_validator from langchain.schema import BaseRetriever from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_env -from langchain.vectorstores.base import VectorStore logger = logging.getLogger() diff --git a/libs/langchain/langchain/vectorstores/bageldb.py b/libs/langchain/langchain/vectorstores/bageldb.py index e8869a41b3..c2119739d4 100644 --- a/libs/langchain/langchain/vectorstores/bageldb.py +++ b/libs/langchain/langchain/vectorstores/bageldb.py @@ -20,8 +20,8 @@ if TYPE_CHECKING: from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import xor_args -from langchain.vectorstores.base import VectorStore DEFAULT_K = 5 diff --git a/libs/langchain/langchain/vectorstores/base.py b/libs/langchain/langchain/vectorstores/base.py index 2db598837a..05e90ef5f0 100644 --- a/libs/langchain/langchain/vectorstores/base.py +++ b/libs/langchain/langchain/vectorstores/base.py @@ -1,608 +1,3 @@ -from __future__ import annotations +from langchain.schema.vectorstore import VectorStore, VectorStoreRetriever -import asyncio -import logging -import math -import warnings -from abc import ABC, abstractmethod -from functools import partial -from typing import ( - Any, - Callable, - ClassVar, - Collection, - Dict, - Iterable, - List, - Optional, - Tuple, - Type, - TypeVar, -) - -from langchain.callbacks.manager import ( - AsyncCallbackManagerForRetrieverRun, - CallbackManagerForRetrieverRun, -) -from langchain.docstore.document import Document -from langchain.pydantic_v1 import Field, root_validator -from langchain.schema import BaseRetriever -from langchain.schema.embeddings import Embeddings - -logger = logging.getLogger(__name__) - -VST = TypeVar("VST", bound="VectorStore") - - -class VectorStore(ABC): - """Interface for vector store.""" - - @abstractmethod - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore. - - Args: - texts: Iterable of strings to add to the vectorstore. - metadatas: Optional list of metadatas associated with the texts. - kwargs: vectorstore specific parameters - - Returns: - List of ids from adding the texts into the vectorstore. - """ - - @property - def embeddings(self) -> Optional[Embeddings]: - """Access the query embedding object if available.""" - logger.debug( - f"{Embeddings.__name__} is not implemented for {self.__class__.__name__}" - ) - return None - - def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: - """Delete by vector ID or other criteria. - - Args: - ids: List of ids to delete. - **kwargs: Other keyword arguments that subclasses might use. - - Returns: - Optional[bool]: True if deletion is successful, - False otherwise, None if not implemented. - """ - - raise NotImplementedError("delete method must be implemented by subclass.") - - async def aadd_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> List[str]: - """Run more texts through the embeddings and add to the vectorstore.""" - raise NotImplementedError - - def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: - """Run more documents through the embeddings and add to the vectorstore. - - Args: - documents (List[Document]: Documents to add to the vectorstore. - - Returns: - List[str]: List of IDs of the added texts. - """ - # TODO: Handle the case where the user doesn't provide ids on the Collection - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - return self.add_texts(texts, metadatas, **kwargs) - - async def aadd_documents( - self, documents: List[Document], **kwargs: Any - ) -> List[str]: - """Run more documents through the embeddings and add to the vectorstore. - - Args: - documents (List[Document]: Documents to add to the vectorstore. - - Returns: - List[str]: List of IDs of the added texts. - """ - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - return await self.aadd_texts(texts, metadatas, **kwargs) - - def search(self, query: str, search_type: str, **kwargs: Any) -> List[Document]: - """Return docs most similar to query using specified search type.""" - if search_type == "similarity": - return self.similarity_search(query, **kwargs) - elif search_type == "mmr": - return self.max_marginal_relevance_search(query, **kwargs) - else: - raise ValueError( - f"search_type of {search_type} not allowed. Expected " - "search_type to be 'similarity' or 'mmr'." - ) - - async def asearch( - self, query: str, search_type: str, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query using specified search type.""" - if search_type == "similarity": - return await self.asimilarity_search(query, **kwargs) - elif search_type == "mmr": - return await self.amax_marginal_relevance_search(query, **kwargs) - else: - raise ValueError( - f"search_type of {search_type} not allowed. Expected " - "search_type to be 'similarity' or 'mmr'." - ) - - @abstractmethod - def similarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query.""" - - @staticmethod - def _euclidean_relevance_score_fn(distance: float) -> float: - """Return a similarity score on a scale [0, 1].""" - # The 'correct' relevance function - # may differ depending on a few things, including: - # - the distance / similarity metric used by the VectorStore - # - the scale of your embeddings (OpenAI's are unit normed. Many - # others are not!) - # - embedding dimensionality - # - etc. - # This function converts the euclidean norm of normalized embeddings - # (0 is most similar, sqrt(2) most dissimilar) - # to a similarity function (0 to 1) - return 1.0 - distance / math.sqrt(2) - - @staticmethod - def _cosine_relevance_score_fn(distance: float) -> float: - """Normalize the distance to a score on a scale [0, 1].""" - - return 1.0 - distance - - @staticmethod - def _max_inner_product_relevance_score_fn(distance: float) -> float: - """Normalize the distance to a score on a scale [0, 1].""" - if distance > 0: - return 1.0 - distance - - return -1.0 * distance - - def _select_relevance_score_fn(self) -> Callable[[float], float]: - """ - The 'correct' relevance function - may differ depending on a few things, including: - - the distance / similarity metric used by the VectorStore - - the scale of your embeddings (OpenAI's are unit normed. Many others are not!) - - embedding dimensionality - - etc. - - Vectorstores should define their own selection based method of relevance. - """ - raise NotImplementedError - - def similarity_search_with_score( - self, *args: Any, **kwargs: Any - ) -> List[Tuple[Document, float]]: - """Run similarity search with distance.""" - raise NotImplementedError - - def _similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """ - Default similarity search with relevance scores. Modify if necessary - in subclass. - Return docs and relevance scores in the range [0, 1]. - - 0 is dissimilar, 1 is most similar. - - Args: - query: input text - k: Number of Documents to return. Defaults to 4. - **kwargs: kwargs to be passed to similarity search. Should include: - score_threshold: Optional, a floating point value between 0 to 1 to - filter the resulting set of retrieved docs - - Returns: - List of Tuples of (doc, similarity_score) - """ - relevance_score_fn = self._select_relevance_score_fn() - docs_and_scores = self.similarity_search_with_score(query, k, **kwargs) - return [(doc, relevance_score_fn(score)) for doc, score in docs_and_scores] - - def similarity_search_with_relevance_scores( - self, - query: str, - k: int = 4, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and relevance scores in the range [0, 1]. - - 0 is dissimilar, 1 is most similar. - - Args: - query: input text - k: Number of Documents to return. Defaults to 4. - **kwargs: kwargs to be passed to similarity search. Should include: - score_threshold: Optional, a floating point value between 0 to 1 to - filter the resulting set of retrieved docs - - Returns: - List of Tuples of (doc, similarity_score) - """ - score_threshold = kwargs.pop("score_threshold", None) - - docs_and_similarities = self._similarity_search_with_relevance_scores( - query, k=k, **kwargs - ) - if any( - similarity < 0.0 or similarity > 1.0 - for _, similarity in docs_and_similarities - ): - warnings.warn( - "Relevance scores must be between" - f" 0 and 1, got {docs_and_similarities}" - ) - - if score_threshold is not None: - docs_and_similarities = [ - (doc, similarity) - for doc, similarity in docs_and_similarities - if similarity >= score_threshold - ] - if len(docs_and_similarities) == 0: - warnings.warn( - "No relevant docs were retrieved using the relevance score" - f" threshold {score_threshold}" - ) - return docs_and_similarities - - async def asimilarity_search_with_relevance_scores( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Tuple[Document, float]]: - """Return docs most similar to query.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial( - self.similarity_search_with_relevance_scores, query, k=k, **kwargs - ) - return await asyncio.get_event_loop().run_in_executor(None, func) - - async def asimilarity_search( - self, query: str, k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to query.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial(self.similarity_search, query, k=k, **kwargs) - return await asyncio.get_event_loop().run_in_executor(None, func) - - def similarity_search_by_vector( - self, embedding: List[float], k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to embedding vector. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - - Returns: - List of Documents most similar to the query vector. - """ - raise NotImplementedError - - async def asimilarity_search_by_vector( - self, embedding: List[float], k: int = 4, **kwargs: Any - ) -> List[Document]: - """Return docs most similar to embedding vector.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial(self.similarity_search_by_vector, embedding, k=k, **kwargs) - return await asyncio.get_event_loop().run_in_executor(None, func) - - def max_marginal_relevance_search( - self, - query: str, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - query: Text to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - Returns: - List of Documents selected by maximal marginal relevance. - """ - raise NotImplementedError - - async def amax_marginal_relevance_search( - self, - query: str, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance.""" - - # This is a temporary workaround to make the similarity search - # asynchronous. The proper solution is to make the similarity search - # asynchronous in the vector store implementations. - func = partial( - self.max_marginal_relevance_search, - query, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - **kwargs, - ) - return await asyncio.get_event_loop().run_in_executor(None, func) - - def max_marginal_relevance_search_by_vector( - self, - embedding: List[float], - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. - - Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. - Returns: - List of Documents selected by maximal marginal relevance. - """ - raise NotImplementedError - - async def amax_marginal_relevance_search_by_vector( - self, - embedding: List[float], - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance.""" - raise NotImplementedError - - @classmethod - def from_documents( - cls: Type[VST], - documents: List[Document], - embedding: Embeddings, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from documents and embeddings.""" - texts = [d.page_content for d in documents] - metadatas = [d.metadata for d in documents] - return cls.from_texts(texts, embedding, metadatas=metadatas, **kwargs) - - @classmethod - async def afrom_documents( - cls: Type[VST], - documents: List[Document], - embedding: Embeddings, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from documents and embeddings.""" - texts = [d.page_content for d in documents] - metadatas = [d.metadata for d in documents] - return await cls.afrom_texts(texts, embedding, metadatas=metadatas, **kwargs) - - @classmethod - @abstractmethod - def from_texts( - cls: Type[VST], - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from texts and embeddings.""" - - @classmethod - async def afrom_texts( - cls: Type[VST], - texts: List[str], - embedding: Embeddings, - metadatas: Optional[List[dict]] = None, - **kwargs: Any, - ) -> VST: - """Return VectorStore initialized from texts and embeddings.""" - raise NotImplementedError - - def _get_retriever_tags(self) -> List[str]: - """Get tags for retriever.""" - tags = [self.__class__.__name__] - if self.embeddings: - tags.append(self.embeddings.__class__.__name__) - return tags - - def as_retriever(self, **kwargs: Any) -> VectorStoreRetriever: - """Return VectorStoreRetriever initialized from this VectorStore. - - Args: - search_type (Optional[str]): Defines the type of search that - the Retriever should perform. - Can be "similarity" (default), "mmr", or - "similarity_score_threshold". - search_kwargs (Optional[Dict]): Keyword arguments to pass to the - search function. Can include things like: - k: Amount of documents to return (Default: 4) - score_threshold: Minimum relevance threshold - for similarity_score_threshold - fetch_k: Amount of documents to pass to MMR algorithm (Default: 20) - lambda_mult: Diversity of results returned by MMR; - 1 for minimum diversity and 0 for maximum. (Default: 0.5) - filter: Filter by document metadata - - Returns: - VectorStoreRetriever: Retriever class for VectorStore. - - Examples: - - .. code-block:: python - - # Retrieve more documents with higher diversity - # Useful if your dataset has many similar documents - docsearch.as_retriever( - search_type="mmr", - search_kwargs={'k': 6, 'lambda_mult': 0.25} - ) - - # Fetch more documents for the MMR algorithm to consider - # But only return the top 5 - docsearch.as_retriever( - search_type="mmr", - search_kwargs={'k': 5, 'fetch_k': 50} - ) - - # Only retrieve documents that have a relevance score - # Above a certain threshold - docsearch.as_retriever( - search_type="similarity_score_threshold", - search_kwargs={'score_threshold': 0.8} - ) - - # Only get the single most similar document from the dataset - docsearch.as_retriever(search_kwargs={'k': 1}) - - # Use a filter to only retrieve documents from a specific paper - docsearch.as_retriever( - search_kwargs={'filter': {'paper_title':'GPT-4 Technical Report'}} - ) - """ - tags = kwargs.pop("tags", None) or [] - tags.extend(self._get_retriever_tags()) - - return VectorStoreRetriever(vectorstore=self, **kwargs, tags=tags) - - -class VectorStoreRetriever(BaseRetriever): - """Base Retriever class for VectorStore.""" - - vectorstore: VectorStore - """VectorStore to use for retrieval.""" - search_type: str = "similarity" - """Type of search to perform. Defaults to "similarity".""" - search_kwargs: dict = Field(default_factory=dict) - """Keyword arguments to pass to the search function.""" - allowed_search_types: ClassVar[Collection[str]] = ( - "similarity", - "similarity_score_threshold", - "mmr", - ) - - class Config: - """Configuration for this pydantic object.""" - - arbitrary_types_allowed = True - - @root_validator() - def validate_search_type(cls, values: Dict) -> Dict: - """Validate search type.""" - search_type = values["search_type"] - if search_type not in cls.allowed_search_types: - raise ValueError( - f"search_type of {search_type} not allowed. Valid values are: " - f"{cls.allowed_search_types}" - ) - if search_type == "similarity_score_threshold": - score_threshold = values["search_kwargs"].get("score_threshold") - if (score_threshold is None) or (not isinstance(score_threshold, float)): - raise ValueError( - "`score_threshold` is not specified with a float value(0~1) " - "in `search_kwargs`." - ) - return values - - def _get_relevant_documents( - self, query: str, *, run_manager: CallbackManagerForRetrieverRun - ) -> List[Document]: - if self.search_type == "similarity": - docs = self.vectorstore.similarity_search(query, **self.search_kwargs) - elif self.search_type == "similarity_score_threshold": - docs_and_similarities = ( - self.vectorstore.similarity_search_with_relevance_scores( - query, **self.search_kwargs - ) - ) - docs = [doc for doc, _ in docs_and_similarities] - elif self.search_type == "mmr": - docs = self.vectorstore.max_marginal_relevance_search( - query, **self.search_kwargs - ) - else: - raise ValueError(f"search_type of {self.search_type} not allowed.") - return docs - - async def _aget_relevant_documents( - self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun - ) -> List[Document]: - if self.search_type == "similarity": - docs = await self.vectorstore.asimilarity_search( - query, **self.search_kwargs - ) - elif self.search_type == "similarity_score_threshold": - docs_and_similarities = ( - await self.vectorstore.asimilarity_search_with_relevance_scores( - query, **self.search_kwargs - ) - ) - docs = [doc for doc, _ in docs_and_similarities] - elif self.search_type == "mmr": - docs = await self.vectorstore.amax_marginal_relevance_search( - query, **self.search_kwargs - ) - else: - raise ValueError(f"search_type of {self.search_type} not allowed.") - return docs - - def add_documents(self, documents: List[Document], **kwargs: Any) -> List[str]: - """Add documents to vectorstore.""" - return self.vectorstore.add_documents(documents, **kwargs) - - async def aadd_documents( - self, documents: List[Document], **kwargs: Any - ) -> List[str]: - """Add documents to vectorstore.""" - return await self.vectorstore.aadd_documents(documents, **kwargs) +__all__ = ["VectorStore", "VectorStoreRetriever"] diff --git a/libs/langchain/langchain/vectorstores/cassandra.py b/libs/langchain/langchain/vectorstores/cassandra.py index c8275ee97c..d57c05cf86 100644 --- a/libs/langchain/langchain/vectorstores/cassandra.py +++ b/libs/langchain/langchain/vectorstores/cassandra.py @@ -22,7 +22,7 @@ if typing.TYPE_CHECKING: from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance CVST = TypeVar("CVST", bound="Cassandra") diff --git a/libs/langchain/langchain/vectorstores/chroma.py b/libs/langchain/langchain/vectorstores/chroma.py index faa64968dd..c666e60c64 100644 --- a/libs/langchain/langchain/vectorstores/chroma.py +++ b/libs/langchain/langchain/vectorstores/chroma.py @@ -18,8 +18,8 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import xor_args -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/clarifai.py b/libs/langchain/langchain/vectorstores/clarifai.py index 0b2cdfeeca..1d92c7e5cf 100644 --- a/libs/langchain/langchain/vectorstores/clarifai.py +++ b/libs/langchain/langchain/vectorstores/clarifai.py @@ -10,7 +10,7 @@ import requests from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/clickhouse.py b/libs/langchain/langchain/vectorstores/clickhouse.py index 9564544831..76266a672c 100644 --- a/libs/langchain/langchain/vectorstores/clickhouse.py +++ b/libs/langchain/langchain/vectorstores/clickhouse.py @@ -9,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Union from langchain.docstore.document import Document from langchain.pydantic_v1 import BaseSettings from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore logger = logging.getLogger() diff --git a/libs/langchain/langchain/vectorstores/dashvector.py b/libs/langchain/langchain/vectorstores/dashvector.py index d07c6732b8..618b2be452 100644 --- a/libs/langchain/langchain/vectorstores/dashvector.py +++ b/libs/langchain/langchain/vectorstores/dashvector.py @@ -14,8 +14,8 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_env -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/deeplake.py b/libs/langchain/langchain/vectorstores/deeplake.py index 6b45d8d615..501b9bbbd5 100644 --- a/libs/langchain/langchain/vectorstores/deeplake.py +++ b/libs/langchain/langchain/vectorstores/deeplake.py @@ -16,7 +16,7 @@ except ImportError: from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/dingo.py b/libs/langchain/langchain/vectorstores/dingo.py index 85776878cd..256fdea93d 100644 --- a/libs/langchain/langchain/vectorstores/dingo.py +++ b/libs/langchain/langchain/vectorstores/dingo.py @@ -8,7 +8,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/elastic_vector_search.py b/libs/langchain/langchain/vectorstores/elastic_vector_search.py index 6db4472950..2ef39d1e96 100644 --- a/libs/langchain/langchain/vectorstores/elastic_vector_search.py +++ b/libs/langchain/langchain/vectorstores/elastic_vector_search.py @@ -18,8 +18,8 @@ from typing import ( from langchain._api import deprecated from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore if TYPE_CHECKING: from elasticsearch import Elasticsearch diff --git a/libs/langchain/langchain/vectorstores/elasticsearch.py b/libs/langchain/langchain/vectorstores/elasticsearch.py index 7a47dfbd40..0066d5071d 100644 --- a/libs/langchain/langchain/vectorstores/elasticsearch.py +++ b/libs/langchain/langchain/vectorstores/elasticsearch.py @@ -16,7 +16,7 @@ from typing import ( from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import DistanceStrategy if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/epsilla.py b/libs/langchain/langchain/vectorstores/epsilla.py index 380d8dffca..94521513ce 100644 --- a/libs/langchain/langchain/vectorstores/epsilla.py +++ b/libs/langchain/langchain/vectorstores/epsilla.py @@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore if TYPE_CHECKING: from pyepsilla import vectordb diff --git a/libs/langchain/langchain/vectorstores/faiss.py b/libs/langchain/langchain/vectorstores/faiss.py index b796b8b903..2fc3bb68eb 100644 --- a/libs/langchain/langchain/vectorstores/faiss.py +++ b/libs/langchain/langchain/vectorstores/faiss.py @@ -23,7 +23,7 @@ from langchain.docstore.base import AddableMixin, Docstore from langchain.docstore.document import Document from langchain.docstore.in_memory import InMemoryDocstore from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import DistanceStrategy, maximal_marginal_relevance diff --git a/libs/langchain/langchain/vectorstores/hologres.py b/libs/langchain/langchain/vectorstores/hologres.py index b633e517c1..20a9c254fe 100644 --- a/libs/langchain/langchain/vectorstores/hologres.py +++ b/libs/langchain/langchain/vectorstores/hologres.py @@ -7,8 +7,8 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore ADA_TOKEN_COUNT = 1536 _LANGCHAIN_DEFAULT_TABLE_NAME = "langchain_pg_embedding" diff --git a/libs/langchain/langchain/vectorstores/lancedb.py b/libs/langchain/langchain/vectorstores/lancedb.py index a06b93208b..c396ef648b 100644 --- a/libs/langchain/langchain/vectorstores/lancedb.py +++ b/libs/langchain/langchain/vectorstores/lancedb.py @@ -5,7 +5,7 @@ from typing import Any, Iterable, List, Optional from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore class LanceDB(VectorStore): diff --git a/libs/langchain/langchain/vectorstores/llm_rails.py b/libs/langchain/langchain/vectorstores/llm_rails.py index fc9b105609..cf6c8558a1 100644 --- a/libs/langchain/langchain/vectorstores/llm_rails.py +++ b/libs/langchain/langchain/vectorstores/llm_rails.py @@ -13,7 +13,7 @@ import requests from langchain.pydantic_v1 import Field from langchain.schema import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore, VectorStoreRetriever +from langchain.schema.vectorstore import VectorStore, VectorStoreRetriever class ModelChoices(str, Enum): diff --git a/libs/langchain/langchain/vectorstores/marqo.py b/libs/langchain/langchain/vectorstores/marqo.py index 060695af85..261e14fd08 100644 --- a/libs/langchain/langchain/vectorstores/marqo.py +++ b/libs/langchain/langchain/vectorstores/marqo.py @@ -17,7 +17,7 @@ from typing import ( from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore if TYPE_CHECKING: import marqo diff --git a/libs/langchain/langchain/vectorstores/matching_engine.py b/libs/langchain/langchain/vectorstores/matching_engine.py index c5f1286615..1e197fe819 100644 --- a/libs/langchain/langchain/vectorstores/matching_engine.py +++ b/libs/langchain/langchain/vectorstores/matching_engine.py @@ -9,7 +9,7 @@ from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Type from langchain.docstore.document import Document from langchain.embeddings import TensorflowHubEmbeddings from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore if TYPE_CHECKING: from google.cloud import storage diff --git a/libs/langchain/langchain/vectorstores/meilisearch.py b/libs/langchain/langchain/vectorstores/meilisearch.py index 0f6ae827c4..e0e8c2846d 100644 --- a/libs/langchain/langchain/vectorstores/meilisearch.py +++ b/libs/langchain/langchain/vectorstores/meilisearch.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Ty from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_env -from langchain.vectorstores.base import VectorStore if TYPE_CHECKING: from meilisearch import Client diff --git a/libs/langchain/langchain/vectorstores/milvus.py b/libs/langchain/langchain/vectorstores/milvus.py index aac290ebd1..53fbef6116 100644 --- a/libs/langchain/langchain/vectorstores/milvus.py +++ b/libs/langchain/langchain/vectorstores/milvus.py @@ -8,7 +8,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/mongodb_atlas.py b/libs/langchain/langchain/vectorstores/mongodb_atlas.py index b3f15a1dee..ef13747311 100644 --- a/libs/langchain/langchain/vectorstores/mongodb_atlas.py +++ b/libs/langchain/langchain/vectorstores/mongodb_atlas.py @@ -18,7 +18,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/myscale.py b/libs/langchain/langchain/vectorstores/myscale.py index 6b6e208a36..e4f461273c 100644 --- a/libs/langchain/langchain/vectorstores/myscale.py +++ b/libs/langchain/langchain/vectorstores/myscale.py @@ -9,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple from langchain.docstore.document import Document from langchain.pydantic_v1 import BaseSettings from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore logger = logging.getLogger() diff --git a/libs/langchain/langchain/vectorstores/neo4j_vector.py b/libs/langchain/langchain/vectorstores/neo4j_vector.py index 59d88b5a02..5e2184cdb5 100644 --- a/libs/langchain/langchain/vectorstores/neo4j_vector.py +++ b/libs/langchain/langchain/vectorstores/neo4j_vector.py @@ -16,8 +16,8 @@ from typing import ( from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_env -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import DistanceStrategy DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE diff --git a/libs/langchain/langchain/vectorstores/nucliadb.py b/libs/langchain/langchain/vectorstores/nucliadb.py index 6f9513a66c..de4537ca4e 100644 --- a/libs/langchain/langchain/vectorstores/nucliadb.py +++ b/libs/langchain/langchain/vectorstores/nucliadb.py @@ -3,7 +3,7 @@ from typing import Any, Dict, Iterable, List, Optional, Type from langchain.schema.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VST, VectorStore +from langchain.schema.vectorstore import VST, VectorStore FIELD_TYPES = { "f": "files", diff --git a/libs/langchain/langchain/vectorstores/opensearch_vector_search.py b/libs/langchain/langchain/vectorstores/opensearch_vector_search.py index bb63c2f806..0c3117ba58 100644 --- a/libs/langchain/langchain/vectorstores/opensearch_vector_search.py +++ b/libs/langchain/langchain/vectorstores/opensearch_vector_search.py @@ -8,8 +8,8 @@ import numpy as np from langchain.schema import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance IMPORT_OPENSEARCH_PY_ERROR = ( diff --git a/libs/langchain/langchain/vectorstores/pgembedding.py b/libs/langchain/langchain/vectorstores/pgembedding.py index d2ee868b8b..7a3df62c29 100644 --- a/libs/langchain/langchain/vectorstores/pgembedding.py +++ b/libs/langchain/langchain/vectorstores/pgembedding.py @@ -11,8 +11,8 @@ from sqlalchemy.orm import Session, declarative_base, relationship from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore Base = declarative_base() # type: Any diff --git a/libs/langchain/langchain/vectorstores/pgvector.py b/libs/langchain/langchain/vectorstores/pgvector.py index 7091a7e8a3..f418f852ca 100644 --- a/libs/langchain/langchain/vectorstores/pgvector.py +++ b/libs/langchain/langchain/vectorstores/pgvector.py @@ -27,8 +27,8 @@ from sqlalchemy.orm import Session, declarative_base from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/pinecone.py b/libs/langchain/langchain/vectorstores/pinecone.py index 25e0f161f5..bdce378638 100644 --- a/libs/langchain/langchain/vectorstores/pinecone.py +++ b/libs/langchain/langchain/vectorstores/pinecone.py @@ -9,8 +9,8 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils.iter import batch_iterate -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import DistanceStrategy, maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/redis/base.py b/libs/langchain/langchain/vectorstores/redis/base.py index 3e4c31dd57..8409f4fa55 100644 --- a/libs/langchain/langchain/vectorstores/redis/base.py +++ b/libs/langchain/langchain/vectorstores/redis/base.py @@ -30,6 +30,7 @@ from langchain.callbacks.manager import ( ) from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore, VectorStoreRetriever from langchain.utilities.redis import ( _array_to_buffer, _buffer_to_array, @@ -37,7 +38,6 @@ from langchain.utilities.redis import ( get_client, ) from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore, VectorStoreRetriever from langchain.vectorstores.redis.constants import ( REDIS_REQUIRED_MODULES, REDIS_TAG_SEPARATOR, diff --git a/libs/langchain/langchain/vectorstores/rocksetdb.py b/libs/langchain/langchain/vectorstores/rocksetdb.py index d825453e88..87410cebec 100644 --- a/libs/langchain/langchain/vectorstores/rocksetdb.py +++ b/libs/langchain/langchain/vectorstores/rocksetdb.py @@ -6,7 +6,7 @@ from typing import Any, Iterable, List, Optional, Tuple from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/scann.py b/libs/langchain/langchain/vectorstores/scann.py index 01608a89de..9e73031408 100644 --- a/libs/langchain/langchain/vectorstores/scann.py +++ b/libs/langchain/langchain/vectorstores/scann.py @@ -12,7 +12,7 @@ from langchain.docstore.base import AddableMixin, Docstore from langchain.docstore.document import Document from langchain.docstore.in_memory import InMemoryDocstore from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import DistanceStrategy diff --git a/libs/langchain/langchain/vectorstores/singlestoredb.py b/libs/langchain/langchain/vectorstores/singlestoredb.py index 761658ff43..4e41269fef 100644 --- a/libs/langchain/langchain/vectorstores/singlestoredb.py +++ b/libs/langchain/langchain/vectorstores/singlestoredb.py @@ -21,7 +21,7 @@ from langchain.callbacks.manager import ( ) from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore, VectorStoreRetriever +from langchain.schema.vectorstore import VectorStore, VectorStoreRetriever from langchain.vectorstores.utils import DistanceStrategy DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.DOT_PRODUCT diff --git a/libs/langchain/langchain/vectorstores/sklearn.py b/libs/langchain/langchain/vectorstores/sklearn.py index eb270279d3..32224e9b10 100644 --- a/libs/langchain/langchain/vectorstores/sklearn.py +++ b/libs/langchain/langchain/vectorstores/sklearn.py @@ -12,8 +12,8 @@ from uuid import uuid4 from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import guard_import -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance DEFAULT_K = 4 # Number of Documents to return. diff --git a/libs/langchain/langchain/vectorstores/sqlitevss.py b/libs/langchain/langchain/vectorstores/sqlitevss.py index 8e0e57ac9e..fcc4157b26 100644 --- a/libs/langchain/langchain/vectorstores/sqlitevss.py +++ b/libs/langchain/langchain/vectorstores/sqlitevss.py @@ -15,7 +15,7 @@ from typing import ( from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore if TYPE_CHECKING: import sqlite3 diff --git a/libs/langchain/langchain/vectorstores/starrocks.py b/libs/langchain/langchain/vectorstores/starrocks.py index fd0abe5073..04dbec591e 100644 --- a/libs/langchain/langchain/vectorstores/starrocks.py +++ b/libs/langchain/langchain/vectorstores/starrocks.py @@ -9,7 +9,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple from langchain.docstore.document import Document from langchain.pydantic_v1 import BaseSettings from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore logger = logging.getLogger() DEBUG = False diff --git a/libs/langchain/langchain/vectorstores/supabase.py b/libs/langchain/langchain/vectorstores/supabase.py index 437f304da6..2cb04385ae 100644 --- a/libs/langchain/langchain/vectorstores/supabase.py +++ b/libs/langchain/langchain/vectorstores/supabase.py @@ -18,7 +18,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/tair.py b/libs/langchain/langchain/vectorstores/tair.py index 6efeae34d0..0e9d29023a 100644 --- a/libs/langchain/langchain/vectorstores/tair.py +++ b/libs/langchain/langchain/vectorstores/tair.py @@ -7,8 +7,8 @@ from typing import Any, Iterable, List, Optional, Type from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/tencentvectordb.py b/libs/langchain/langchain/vectorstores/tencentvectordb.py index bd6db87c9b..fe77390c72 100644 --- a/libs/langchain/langchain/vectorstores/tencentvectordb.py +++ b/libs/langchain/langchain/vectorstores/tencentvectordb.py @@ -10,8 +10,8 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import guard_import -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/timescalevector.py b/libs/langchain/langchain/vectorstores/timescalevector.py index a25cb97c5a..4f455e2b9e 100644 --- a/libs/langchain/langchain/vectorstores/timescalevector.py +++ b/libs/langchain/langchain/vectorstores/timescalevector.py @@ -20,8 +20,8 @@ from typing import ( from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_dict_or_env -from langchain.vectorstores.base import VectorStore from langchain.vectorstores.utils import DistanceStrategy if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/typesense.py b/libs/langchain/langchain/vectorstores/typesense.py index 5ef4627f50..622cdb5a4e 100644 --- a/libs/langchain/langchain/vectorstores/typesense.py +++ b/libs/langchain/langchain/vectorstores/typesense.py @@ -5,8 +5,8 @@ from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Union from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings +from langchain.schema.vectorstore import VectorStore from langchain.utils import get_from_env -from langchain.vectorstores.base import VectorStore if TYPE_CHECKING: from typesense.client import Client diff --git a/libs/langchain/langchain/vectorstores/usearch.py b/libs/langchain/langchain/vectorstores/usearch.py index 946bcd40e4..f44fc31ba6 100644 --- a/libs/langchain/langchain/vectorstores/usearch.py +++ b/libs/langchain/langchain/vectorstores/usearch.py @@ -8,7 +8,7 @@ from langchain.docstore.base import AddableMixin, Docstore from langchain.docstore.document import Document from langchain.docstore.in_memory import InMemoryDocstore from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore def dependable_usearch_import() -> Any: diff --git a/libs/langchain/langchain/vectorstores/vald.py b/libs/langchain/langchain/vectorstores/vald.py index 048c437543..560515a527 100644 --- a/libs/langchain/langchain/vectorstores/vald.py +++ b/libs/langchain/langchain/vectorstores/vald.py @@ -7,7 +7,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py index 85c25a6f8e..11cede24e1 100644 --- a/libs/langchain/langchain/vectorstores/vearch.py +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -9,7 +9,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore if TYPE_CHECKING: import vearch diff --git a/libs/langchain/langchain/vectorstores/vectara.py b/libs/langchain/langchain/vectorstores/vectara.py index 735fd915e5..8804b01283 100644 --- a/libs/langchain/langchain/vectorstores/vectara.py +++ b/libs/langchain/langchain/vectorstores/vectara.py @@ -11,7 +11,7 @@ import requests from langchain.pydantic_v1 import Field from langchain.schema import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore, VectorStoreRetriever +from langchain.schema.vectorstore import VectorStore, VectorStoreRetriever logger = logging.getLogger(__name__) diff --git a/libs/langchain/langchain/vectorstores/weaviate.py b/libs/langchain/langchain/vectorstores/weaviate.py index 49a1d8f58c..bf515f055e 100644 --- a/libs/langchain/langchain/vectorstores/weaviate.py +++ b/libs/langchain/langchain/vectorstores/weaviate.py @@ -18,7 +18,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/langchain/vectorstores/xata.py b/libs/langchain/langchain/vectorstores/xata.py index 34076a90aa..d031a21e79 100644 --- a/libs/langchain/langchain/vectorstores/xata.py +++ b/libs/langchain/langchain/vectorstores/xata.py @@ -6,7 +6,7 @@ from typing import Any, Dict, Iterable, List, Optional, Tuple, Type from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore class XataVectorStore(VectorStore): diff --git a/libs/langchain/langchain/vectorstores/zep.py b/libs/langchain/langchain/vectorstores/zep.py index 28783ea431..a55ad08ed1 100644 --- a/libs/langchain/langchain/vectorstores/zep.py +++ b/libs/langchain/langchain/vectorstores/zep.py @@ -9,7 +9,7 @@ import numpy as np from langchain.docstore.document import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: diff --git a/libs/langchain/tests/unit_tests/indexes/test_indexing.py b/libs/langchain/tests/unit_tests/indexes/test_indexing.py index 9e4a59e1b6..b3e3821b2f 100644 --- a/libs/langchain/tests/unit_tests/indexes/test_indexing.py +++ b/libs/langchain/tests/unit_tests/indexes/test_indexing.py @@ -9,7 +9,7 @@ from langchain.indexes import index from langchain.indexes._sql_record_manager import SQLRecordManager from langchain.schema import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VST, VectorStore +from langchain.schema.vectorstore import VST, VectorStore class ToyLoader(BaseLoader): diff --git a/libs/langchain/tests/unit_tests/retrievers/test_time_weighted_retriever.py b/libs/langchain/tests/unit_tests/retrievers/test_time_weighted_retriever.py index dd9058d881..abe220441a 100644 --- a/libs/langchain/tests/unit_tests/retrievers/test_time_weighted_retriever.py +++ b/libs/langchain/tests/unit_tests/retrievers/test_time_weighted_retriever.py @@ -11,7 +11,7 @@ from langchain.retrievers.time_weighted_retriever import ( ) from langchain.schema import Document from langchain.schema.embeddings import Embeddings -from langchain.vectorstores.base import VectorStore +from langchain.schema.vectorstore import VectorStore def _get_example_memories(k: int = 4) -> List[Document]: