mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
Add support for MongoDB Atlas $vectorSearch vector search (#11139)
Adds support for the `$vectorSearch` operator for MongoDBAtlasVectorSearch, which was announced at .Local London (September 26th, 2023). This change breaks compatibility with the existing `$search` operator used by the original integration (https://github.com/langchain-ai/langchain/pull/5338) due to incompatibilities in the Atlas search implementations. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent
b599f91e33
commit
2c952de21a
@ -89,6 +89,18 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
|||||||
embedding: Embeddings,
|
embedding: Embeddings,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> MongoDBAtlasVectorSearch:
|
) -> MongoDBAtlasVectorSearch:
|
||||||
|
"""Construct a `MongoDB Atlas Vector Search` vector store
|
||||||
|
from a MongoDB connection URI.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
connection_string: A valid MongoDB connection URI.
|
||||||
|
namespace: A valid MongoDB namespace (database and collection).
|
||||||
|
embedding: The text embedding model to use for the vector store.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A new MongoDBAtlasVectorSearch instance.
|
||||||
|
|
||||||
|
"""
|
||||||
try:
|
try:
|
||||||
from pymongo import MongoClient
|
from pymongo import MongoClient
|
||||||
except ImportError:
|
except ImportError:
|
||||||
@ -149,24 +161,23 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
|||||||
self,
|
self,
|
||||||
embedding: List[float],
|
embedding: List[float],
|
||||||
k: int = 4,
|
k: int = 4,
|
||||||
pre_filter: Optional[dict] = None,
|
pre_filter: Optional[Dict] = None,
|
||||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||||
) -> List[Tuple[Document, float]]:
|
) -> List[Tuple[Document, float]]:
|
||||||
knn_beta = {
|
params = {
|
||||||
"vector": embedding,
|
"queryVector": embedding,
|
||||||
"path": self._embedding_key,
|
"path": self._embedding_key,
|
||||||
"k": k,
|
"numCandidates": k * 10,
|
||||||
|
"limit": k,
|
||||||
|
"index": self._index_name,
|
||||||
}
|
}
|
||||||
if pre_filter:
|
if pre_filter:
|
||||||
knn_beta["filter"] = pre_filter
|
params["filter"] = pre_filter
|
||||||
|
query = {"$vectorSearch": params}
|
||||||
|
|
||||||
pipeline = [
|
pipeline = [
|
||||||
{
|
query,
|
||||||
"$search": {
|
{"$set": {"score": {"$meta": "vectorSearchScore"}}},
|
||||||
"index": self._index_name,
|
|
||||||
"knnBeta": knn_beta,
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{"$set": {"score": {"$meta": "searchScore"}}},
|
|
||||||
]
|
]
|
||||||
if post_filter_pipeline is not None:
|
if post_filter_pipeline is not None:
|
||||||
pipeline.extend(post_filter_pipeline)
|
pipeline.extend(post_filter_pipeline)
|
||||||
@ -183,12 +194,12 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
|||||||
query: str,
|
query: str,
|
||||||
*,
|
*,
|
||||||
k: int = 4,
|
k: int = 4,
|
||||||
pre_filter: Optional[dict] = None,
|
pre_filter: Optional[Dict] = None,
|
||||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||||
) -> List[Tuple[Document, float]]:
|
) -> List[Tuple[Document, float]]:
|
||||||
"""Return MongoDB documents most similar to query, along with scores.
|
"""Return MongoDB documents most similar to the given query and their scores.
|
||||||
|
|
||||||
Use the knnBeta Operator available in MongoDB Atlas Search
|
Uses the knnBeta Operator available in MongoDB Atlas Search.
|
||||||
This feature is in early access and available only for evaluation purposes, to
|
This feature is in early access and available only for evaluation purposes, to
|
||||||
validate functionality, and to gather feedback from a small closed group of
|
validate functionality, and to gather feedback from a small closed group of
|
||||||
early access users. It is not recommended for production deployments as we
|
early access users. It is not recommended for production deployments as we
|
||||||
@ -197,14 +208,14 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
query: Text to look up documents similar to.
|
query: Text to look up documents similar to.
|
||||||
k: Optional Number of Documents to return. Defaults to 4.
|
k: (Optional) number of documents to return. Defaults to 4.
|
||||||
pre_filter: Optional Dictionary of argument(s) to prefilter on document
|
pre_filter: (Optional) dictionary of argument(s) to prefilter document
|
||||||
fields.
|
fields on.
|
||||||
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
|
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
|
||||||
following the knnBeta search.
|
following the knnBeta vector search.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of Documents most similar to the query and score for each
|
List of documents most similar to the query and their scores.
|
||||||
"""
|
"""
|
||||||
embedding = self._embedding.embed_query(query)
|
embedding = self._embedding.embed_query(query)
|
||||||
docs = self._similarity_search_with_score(
|
docs = self._similarity_search_with_score(
|
||||||
@ -219,29 +230,29 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
|||||||
self,
|
self,
|
||||||
query: str,
|
query: str,
|
||||||
k: int = 4,
|
k: int = 4,
|
||||||
pre_filter: Optional[dict] = None,
|
pre_filter: Optional[Dict] = None,
|
||||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""Return MongoDB documents most similar to query.
|
"""Return MongoDB documents most similar to the given query.
|
||||||
|
|
||||||
Use the knnBeta Operator available in MongoDB Atlas Search
|
Uses the knnBeta Operator available in MongoDB Atlas Search.
|
||||||
This feature is in early access and available only for evaluation purposes, to
|
This feature is in early access and available only for evaluation purposes, to
|
||||||
validate functionality, and to gather feedback from a small closed group of
|
validate functionality, and to gather feedback from a small closed group of
|
||||||
early access users. It is not recommended for production deployments as we may
|
early access users. It is not recommended for production deployments as we
|
||||||
introduce breaking changes.
|
may introduce breaking changes.
|
||||||
For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta
|
For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query: Text to look up documents similar to.
|
query: Text to look up documents similar to.
|
||||||
k: Optional Number of Documents to return. Defaults to 4.
|
k: (Optional) number of documents to return. Defaults to 4.
|
||||||
pre_filter: Optional Dictionary of argument(s) to prefilter on document
|
pre_filter: (Optional) dictionary of argument(s) to prefilter document
|
||||||
fields.
|
fields on.
|
||||||
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
|
post_filter_pipeline: (Optional) Pipeline of MongoDB aggregation stages
|
||||||
following the knnBeta search.
|
following the knnBeta vector search.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
List of Documents most similar to the query and score for each
|
List of documents most similar to the query and their scores.
|
||||||
"""
|
"""
|
||||||
docs_and_scores = self.similarity_search_with_score(
|
docs_and_scores = self.similarity_search_with_score(
|
||||||
query,
|
query,
|
||||||
@ -257,30 +268,30 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
|||||||
k: int = 4,
|
k: int = 4,
|
||||||
fetch_k: int = 20,
|
fetch_k: int = 20,
|
||||||
lambda_mult: float = 0.5,
|
lambda_mult: float = 0.5,
|
||||||
pre_filter: Optional[dict] = None,
|
pre_filter: Optional[Dict] = None,
|
||||||
post_filter_pipeline: Optional[List[Dict]] = None,
|
post_filter_pipeline: Optional[List[Dict]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> List[Document]:
|
) -> List[Document]:
|
||||||
"""Return docs selected using the maximal marginal relevance.
|
"""Return documents selected using the maximal marginal relevance.
|
||||||
|
|
||||||
Maximal marginal relevance optimizes for similarity to query AND diversity
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
||||||
among selected documents.
|
among selected documents.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
query: Text to look up documents similar to.
|
query: Text to look up documents similar to.
|
||||||
k: Optional Number of Documents to return. Defaults to 4.
|
k: (Optional) number of documents to return. Defaults to 4.
|
||||||
fetch_k: Optional Number of Documents to fetch before passing to MMR
|
fetch_k: (Optional) number of documents to fetch before passing to MMR
|
||||||
algorithm. Defaults to 20.
|
algorithm. Defaults to 20.
|
||||||
lambda_mult: Number between 0 and 1 that determines the degree
|
lambda_mult: Number between 0 and 1 that determines the degree
|
||||||
of diversity among the results with 0 corresponding
|
of diversity among the results with 0 corresponding
|
||||||
to maximum diversity and 1 to minimum diversity.
|
to maximum diversity and 1 to minimum diversity.
|
||||||
Defaults to 0.5.
|
Defaults to 0.5.
|
||||||
pre_filter: Optional Dictionary of argument(s) to prefilter on document
|
pre_filter: (Optional) dictionary of argument(s) to prefilter on document
|
||||||
fields.
|
fields.
|
||||||
post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
|
post_filter_pipeline: (Optional) pipeline of MongoDB aggregation stages
|
||||||
following the knnBeta search.
|
following the knnBeta vector search.
|
||||||
Returns:
|
Returns:
|
||||||
List of Documents selected by maximal marginal relevance.
|
List of documents selected by maximal marginal relevance.
|
||||||
"""
|
"""
|
||||||
query_embedding = self._embedding.embed_query(query)
|
query_embedding = self._embedding.embed_query(query)
|
||||||
docs = self._similarity_search_with_score(
|
docs = self._similarity_search_with_score(
|
||||||
@ -303,11 +314,11 @@ class MongoDBAtlasVectorSearch(VectorStore):
|
|||||||
cls,
|
cls,
|
||||||
texts: List[str],
|
texts: List[str],
|
||||||
embedding: Embeddings,
|
embedding: Embeddings,
|
||||||
metadatas: Optional[List[dict]] = None,
|
metadatas: Optional[List[Dict]] = None,
|
||||||
collection: Optional[Collection[MongoDBDocumentType]] = None,
|
collection: Optional[Collection[MongoDBDocumentType]] = None,
|
||||||
**kwargs: Any,
|
**kwargs: Any,
|
||||||
) -> MongoDBAtlasVectorSearch:
|
) -> MongoDBAtlasVectorSearch:
|
||||||
"""Construct MongoDBAtlasVectorSearch wrapper from raw documents.
|
"""Construct a `MongoDB Atlas Vector Search` vector store from raw documents.
|
||||||
|
|
||||||
This is a user-friendly interface that:
|
This is a user-friendly interface that:
|
||||||
1. Embeds documents.
|
1. Embeds documents.
|
||||||
|
@ -3,7 +3,7 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import os
|
import os
|
||||||
from time import sleep
|
from time import sleep
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
@ -11,41 +11,46 @@ from langchain.docstore.document import Document
|
|||||||
from langchain.schema.embeddings import Embeddings
|
from langchain.schema.embeddings import Embeddings
|
||||||
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
|
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from pymongo import MongoClient
|
|
||||||
|
|
||||||
INDEX_NAME = "langchain-test-index"
|
INDEX_NAME = "langchain-test-index"
|
||||||
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
NAMESPACE = "langchain_test_db.langchain_test_collection"
|
||||||
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
|
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
|
||||||
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
|
||||||
|
|
||||||
# Instantiate as constant instead of pytest fixture to prevent needing to make multiple
|
|
||||||
# connections.
|
|
||||||
|
|
||||||
|
def get_collection() -> Any:
|
||||||
|
from pymongo import MongoClient
|
||||||
|
|
||||||
@pytest.fixture
|
test_client: MongoClient = MongoClient(CONNECTION_STRING)
|
||||||
def collection() -> Any:
|
|
||||||
test_client = MongoClient(CONNECTION_STRING)
|
|
||||||
return test_client[DB_NAME][COLLECTION_NAME]
|
return test_client[DB_NAME][COLLECTION_NAME]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
|
||||||
|
def collection() -> Any:
|
||||||
|
return get_collection()
|
||||||
|
|
||||||
|
|
||||||
class TestMongoDBAtlasVectorSearch:
|
class TestMongoDBAtlasVectorSearch:
|
||||||
@classmethod
|
@classmethod
|
||||||
def setup_class(cls, collection: Any) -> None:
|
def setup_class(cls) -> None:
|
||||||
# ensure the test collection is empty
|
# ensure the test collection is empty
|
||||||
|
collection = get_collection()
|
||||||
assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501
|
assert collection.count_documents({}) == 0 # type: ignore[index] # noqa: E501
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def teardown_class(cls, collection: Any) -> None:
|
def teardown_class(cls) -> None:
|
||||||
|
collection = get_collection()
|
||||||
# delete all the documents in the collection
|
# delete all the documents in the collection
|
||||||
collection.delete_many({}) # type: ignore[index]
|
collection.delete_many({}) # type: ignore[index]
|
||||||
|
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def setup(self, collection: Any) -> None:
|
def setup(self) -> None:
|
||||||
|
collection = get_collection()
|
||||||
# delete all the documents in the collection
|
# delete all the documents in the collection
|
||||||
collection.delete_many({}) # type: ignore[index]
|
collection.delete_many({}) # type: ignore[index]
|
||||||
|
|
||||||
def test_from_documents(self, embedding_openai: Embeddings) -> None:
|
def test_from_documents(
|
||||||
|
self, embedding_openai: Embeddings, collection: Any
|
||||||
|
) -> None:
|
||||||
"""Test end to end construction and search."""
|
"""Test end to end construction and search."""
|
||||||
documents = [
|
documents = [
|
||||||
Document(page_content="Dogs are tough.", metadata={"a": 1}),
|
Document(page_content="Dogs are tough.", metadata={"a": 1}),
|
||||||
@ -64,7 +69,7 @@ class TestMongoDBAtlasVectorSearch:
|
|||||||
assert output[0].page_content == "What is a sandwich?"
|
assert output[0].page_content == "What is a sandwich?"
|
||||||
assert output[0].metadata["c"] == 1
|
assert output[0].metadata["c"] == 1
|
||||||
|
|
||||||
def test_from_texts(self, embedding_openai: Embeddings) -> None:
|
def test_from_texts(self, embedding_openai: Embeddings, collection: Any) -> None:
|
||||||
texts = [
|
texts = [
|
||||||
"Dogs are tough.",
|
"Dogs are tough.",
|
||||||
"Cats have fluff.",
|
"Cats have fluff.",
|
||||||
@ -81,7 +86,9 @@ class TestMongoDBAtlasVectorSearch:
|
|||||||
output = vectorstore.similarity_search("Sandwich", k=1)
|
output = vectorstore.similarity_search("Sandwich", k=1)
|
||||||
assert output[0].page_content == "What is a sandwich?"
|
assert output[0].page_content == "What is a sandwich?"
|
||||||
|
|
||||||
def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None:
|
def test_from_texts_with_metadatas(
|
||||||
|
self, embedding_openai: Embeddings, collection: Any
|
||||||
|
) -> None:
|
||||||
texts = [
|
texts = [
|
||||||
"Dogs are tough.",
|
"Dogs are tough.",
|
||||||
"Cats have fluff.",
|
"Cats have fluff.",
|
||||||
@ -102,7 +109,7 @@ class TestMongoDBAtlasVectorSearch:
|
|||||||
assert output[0].metadata["c"] == 1
|
assert output[0].metadata["c"] == 1
|
||||||
|
|
||||||
def test_from_texts_with_metadatas_and_pre_filter(
|
def test_from_texts_with_metadatas_and_pre_filter(
|
||||||
self, embedding_openai: Embeddings
|
self, embedding_openai: Embeddings, collection: Any
|
||||||
) -> None:
|
) -> None:
|
||||||
texts = [
|
texts = [
|
||||||
"Dogs are tough.",
|
"Dogs are tough.",
|
||||||
@ -124,7 +131,7 @@ class TestMongoDBAtlasVectorSearch:
|
|||||||
)
|
)
|
||||||
assert output == []
|
assert output == []
|
||||||
|
|
||||||
def test_mmr(self, embedding_openai: Embeddings) -> None:
|
def test_mmr(self, embedding_openai: Embeddings, collection: Any) -> None:
|
||||||
texts = ["foo", "foo", "fou", "foy"]
|
texts = ["foo", "foo", "fou", "foy"]
|
||||||
vectorstore = MongoDBAtlasVectorSearch.from_texts(
|
vectorstore = MongoDBAtlasVectorSearch.from_texts(
|
||||||
texts,
|
texts,
|
||||||
|
Loading…
Reference in New Issue
Block a user