adding MongoDBAtlasVectorSearch (#5338)

# Add MongoDBAtlasVectorSearch to the Python library

Fixes #5337
---------

Co-authored-by: Dev 2049 <dev.dev2049@gmail.com>
Paul-Emile Brotons 12 months ago committed by GitHub
parent c4b502a470
commit a61b7f7e7c

@@ -0,0 +1,170 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "683953b3",
"metadata": {},
"source": [
"# MongoDB Atlas Vector Search\n",
"\n",
">[MongoDB Atlas](https://www.mongodb.com/docs/atlas/) is a document database managed in the cloud. It also enables Lucene and its vector search feature.\n",
"\n",
"This notebook shows how to use the functionality related to the `MongoDB Atlas Vector Search` feature where you can store your embeddings in MongoDB documents and create a Lucene vector index to perform a KNN search.\n",
"\n",
"It uses the [knnBeta Operator](https://www.mongodb.com/docs/atlas/atlas-search/knn-beta) available in MongoDB Atlas Search. This feature is in early access and available only for evaluation purposes, to validate functionality, and to gather feedback from a small closed group of early access users. It is not recommended for production deployments as we may introduce breaking changes.\n",
"\n",
"To use MongoDB Atlas, you must have first deployed a cluster. Free clusters are available. \n",
"Here is the MongoDB Atlas [quick start](https://www.mongodb.com/docs/atlas/getting-started/)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b4c41cad-08ef-4f72-a545-2151e4598efe",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"!pip install pymongo"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "c1e38361-c1fe-4ac6-86e9-c90ebaf7ae87",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"\n",
"MONGODB_ATLAS_URI = os.environ['MONGODB_ATLAS_URI']"
]
},
{
"cell_type": "markdown",
"id": "320af802-9271-46ee-948f-d2453933d44b",
"metadata": {},
"source": [
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key. Make sure the environment variable `OPENAI_API_KEY` is set up before proceeding."
]
},
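{
"cell_type": "markdown",
"id": "openai-key-note",
"metadata": {},
"source": [
"One possible way to do that from the notebook itself (a sketch; any mechanism that sets the environment variable works):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "openai-key-cell",
"metadata": {},
"outputs": [],
"source": [
"import getpass\n",
"import os\n",
"\n",
"# prompt for the key only if it is not already set\n",
"if \"OPENAI_API_KEY\" not in os.environ:\n",
"    os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
]
},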
{
"cell_type": "markdown",
"id": "1f3ecc42",
"metadata": {},
"source": [
"Now, let's create a Lucene vector index on your cluster. In the below example, `embedding` is the name of the field that contains the embedding vector. Please refer to the [documentation](https://www.mongodb.com/docs/atlas/atlas-search/define-field-mappings-for-vector-search) to get more details on how to define an Atlas Search index.\n",
"You can name the index `langchain_demo` and create the index on the namespace `lanchain_db.langchain_col`. Finally, write the following definition in the JSON editor:\n",
"\n",
"```json\n",
"{\n",
" \"mappings\": {\n",
" \"dynamic\": true,\n",
" \"fields\": {\n",
" \"embedding\": {\n",
" \"dimensions\": 1536,\n",
" \"similarity\": \"cosine\",\n",
" \"type\": \"knnVector\"\n",
" }\n",
" }\n",
" }\n",
"}\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "aac9563e",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import MongoDBAtlasVectorSearch\n",
"from langchain.document_loaders import TextLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "a3c3999a",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6e104aee",
"metadata": {},
"outputs": [],
"source": [
"from pymongo import MongoClient\n",
"\n",
"# initialize MongoDB python client\n",
"client = MongoClient(MONGODB_ATLAS_CONNECTION_STRING)\n",
"\n",
"db_name = \"lanchain_db\"\n",
"collection_name = \"langchain_col\"\n",
"namespace = f\"{db_name}.{collection_name}\"\n",
"index_name = \"langchain_demo\"\n",
"\n",
"# insert the documents in MongoDB Atlas with their embedding\n",
"docsearch = MongoDBAtlasVectorSearch.from_documents(\n",
" docs,\n",
" embeddings,\n",
" client=client,\n",
" namespace=namespace,\n",
" index_name=index_name\n",
")\n",
"\n",
"# perform a similarity search between the embedding of the query and the embeddings of the documents\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
]
},
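{
"cell_type": "markdown",
"id": "scores-note",
"metadata": {},
"source": [
"The wrapper also exposes `similarity_search_with_score`, which returns `(document, score)` pairs based on Atlas Search's `searchScore`. A quick sketch reusing `docsearch` and `query` from above:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "scores-cell",
"metadata": {},
"outputs": [],
"source": [
"# each result is a (Document, score) tuple; higher scores are more similar\n",
"docs_with_scores = docsearch.similarity_search_with_score(query, k=2)\n",
"for doc, score in docs_with_scores:\n",
"    print(score, doc.page_content[:100])"
]
},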
{
"cell_type": "code",
"execution_count": null,
"id": "9c608226",
"metadata": {},
"outputs": [],
"source": [
"print(docs[0].page_content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@@ -10,6 +10,7 @@ from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch
from langchain.vectorstores.faiss import FAISS
from langchain.vectorstores.lancedb import LanceDB
from langchain.vectorstores.milvus import Milvus
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
from langchain.vectorstores.myscale import MyScale, MyScaleSettings
from langchain.vectorstores.opensearch_vector_search import OpenSearchVectorSearch
from langchain.vectorstores.pinecone import Pinecone
@@ -38,6 +39,7 @@ __all__ = [
"AtlasDB",
"DeepLake",
"Annoy",
"MongoDBAtlasVectorSearch",
"MyScale",
"MyScaleSettings",
"SKLearnVectorStore",

@@ -0,0 +1,270 @@
from __future__ import annotations

import logging
from typing import (
    TYPE_CHECKING,
    Any,
    Dict,
    Generator,
    Iterable,
    List,
    Optional,
    Tuple,
    Union,
)

from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore

if TYPE_CHECKING:
    from pymongo import MongoClient

logger = logging.getLogger(__name__)

DEFAULT_INSERT_BATCH_SIZE = 100


class MongoDBAtlasVectorSearch(VectorStore):
    """Wrapper around MongoDB Atlas Vector Search.

    To use, you should have both:
    - the ``pymongo`` python package installed
    - a connection string associated with a MongoDB Atlas Cluster having deployed an
      Atlas Search index

    Example:
        .. code-block:: python

            from langchain.vectorstores import MongoDBAtlasVectorSearch
            from langchain.embeddings.openai import OpenAIEmbeddings
            from pymongo import MongoClient

            mongo_client = MongoClient("<YOUR-CONNECTION-STRING>")
            namespace = "<db_name>.<collection_name>"
            embeddings = OpenAIEmbeddings()
            vectorstore = MongoDBAtlasVectorSearch(mongo_client, namespace, embeddings)
    """

    def __init__(
        self,
        client: MongoClient,
        namespace: str,
        embedding: Embeddings,
        *,
        index_name: str = "default",
        text_key: str = "text",
        embedding_key: str = "embedding",
    ):
        """
        Args:
            client: MongoDB client.
            namespace: MongoDB namespace to add the texts to.
            embedding: Text embedding model to use.
            index_name: Name of the Atlas Search index to query.
            text_key: MongoDB field that will contain the text for each
                document.
            embedding_key: MongoDB field that will contain the embedding for
                each document.
        """
        self._client = client
        db_name, collection_name = namespace.split(".")
        self._collection = client[db_name][collection_name]
        self._embedding = embedding
        self._index_name = index_name
        self._text_key = text_key
        self._embedding_key = embedding_key

    @classmethod
    def from_connection_string(
        cls,
        connection_string: str,
        namespace: str,
        embedding: Embeddings,
        **kwargs: Any,
    ) -> MongoDBAtlasVectorSearch:
        try:
            from pymongo import MongoClient
        except ImportError:
            raise ImportError(
                "Could not import pymongo, please install it with "
                "`pip install pymongo`."
            )
        client: MongoClient = MongoClient(connection_string)
        return cls(client, namespace, embedding, **kwargs)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict[str, Any]]] = None,
        **kwargs: Any,
    ) -> List:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        batch_size = kwargs.get("batch_size", DEFAULT_INSERT_BATCH_SIZE)
        _metadatas: Union[List, Generator] = metadatas or ({} for _ in texts)
        texts_batch = []
        metadatas_batch = []
        result_ids = []
        for i, (text, metadata) in enumerate(zip(texts, _metadatas)):
            texts_batch.append(text)
            metadatas_batch.append(metadata)
            if (i + 1) % batch_size == 0:
                result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
                texts_batch = []
                metadatas_batch = []
        if texts_batch:
            result_ids.extend(self._insert_texts(texts_batch, metadatas_batch))
        return result_ids

    def _insert_texts(self, texts: List[str], metadatas: List[Dict[str, Any]]) -> List:
        if not texts:
            return []
        # Embed and create the documents
        embeddings = self._embedding.embed_documents(texts)
        to_insert = [
            {self._text_key: t, self._embedding_key: embedding, **m}
            for t, m, embedding in zip(texts, metadatas, embeddings)
        ]
        # insert the documents in MongoDB Atlas
        insert_result = self._collection.insert_many(to_insert)
        return insert_result.inserted_ids

    def similarity_search_with_score(
        self,
        query: str,
        *,
        k: int = 4,
        pre_filter: Optional[dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
    ) -> List[Tuple[Document, float]]:
        """Return MongoDB documents most similar to query, along with scores.

        Uses the knnBeta Operator available in MongoDB Atlas Search.
        This feature is in early access and available only for evaluation purposes, to
        validate functionality, and to gather feedback from a small closed group of
        early access users. It is not recommended for production deployments as we
        may introduce breaking changes.
        For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta

        Args:
            query: Text to look up documents similar to.
            k: Optional Number of Documents to return. Defaults to 4.
            pre_filter: Optional Dictionary of argument(s) to prefilter on document
                fields.
            post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
                following the knnBeta search.

        Returns:
            List of documents most similar to the query, with a score for each.
        """
        knn_beta = {
            "vector": self._embedding.embed_query(query),
            "path": self._embedding_key,
            "k": k,
        }
        if pre_filter:
            knn_beta["filter"] = pre_filter
        pipeline = [
            {
                "$search": {
                    "index": self._index_name,
                    "knnBeta": knn_beta,
                }
            },
            {"$project": {"score": {"$meta": "searchScore"}, self._embedding_key: 0}},
        ]
        if post_filter_pipeline is not None:
            pipeline.extend(post_filter_pipeline)
        cursor = self._collection.aggregate(pipeline)
        docs = []
        for res in cursor:
            text = res.pop(self._text_key)
            score = res.pop("score")
            docs.append((Document(page_content=text, metadata=res), score))
        return docs

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        pre_filter: Optional[dict] = None,
        post_filter_pipeline: Optional[List[Dict]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return MongoDB documents most similar to query.

        Uses the knnBeta Operator available in MongoDB Atlas Search.
        This feature is in early access and available only for evaluation purposes, to
        validate functionality, and to gather feedback from a small closed group of
        early access users. It is not recommended for production deployments as we may
        introduce breaking changes.
        For more: https://www.mongodb.com/docs/atlas/atlas-search/knn-beta

        Args:
            query: Text to look up documents similar to.
            k: Optional Number of Documents to return. Defaults to 4.
            pre_filter: Optional Dictionary of argument(s) to prefilter on document
                fields.
            post_filter_pipeline: Optional Pipeline of MongoDB aggregation stages
                following the knnBeta search.

        Returns:
            List of documents most similar to the query.
        """
        docs_and_scores = self.similarity_search_with_score(
            query,
            k=k,
            pre_filter=pre_filter,
            post_filter_pipeline=post_filter_pipeline,
        )
        return [doc for doc, _ in docs_and_scores]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        client: Optional[MongoClient] = None,
        namespace: Optional[str] = None,
        **kwargs: Any,
    ) -> MongoDBAtlasVectorSearch:
        """Construct MongoDBAtlasVectorSearch wrapper from raw documents.

        This is a user-friendly interface that:
            1. Embeds documents.
            2. Adds the documents to a provided MongoDB Atlas Vector Search index
               (Lucene)

        This is intended to be a quick way to get started.

        Example:
            .. code-block:: python

                from pymongo import MongoClient
                from langchain.vectorstores import MongoDBAtlasVectorSearch
                from langchain.embeddings import OpenAIEmbeddings

                client = MongoClient("<YOUR-CONNECTION-STRING>")
                namespace = "<db_name>.<collection_name>"
                embeddings = OpenAIEmbeddings()
                vectorstore = MongoDBAtlasVectorSearch.from_texts(
                    texts,
                    embeddings,
                    metadatas=metadatas,
                    client=client,
                    namespace=namespace
                )
        """
        if not client or not namespace:
            raise ValueError("Must provide 'client' and 'namespace' named parameters.")
        vecstore = cls(client, namespace, embedding, **kwargs)
        vecstore.add_texts(texts, metadatas=metadatas)
        return vecstore
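For quick orientation, here is a minimal end-to-end sketch of the new wrapper (a sketch, not part of the diff; it assumes a cluster whose target namespace already has a `knnVector` Atlas Search index named `langchain_demo`, as set up in the notebook above):

```python
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import MongoDBAtlasVectorSearch

# from_connection_string builds the MongoClient internally
vectorstore = MongoDBAtlasVectorSearch.from_connection_string(
    "<YOUR-CONNECTION-STRING>",    # Atlas connection string
    "langchain_db.langchain_col",  # <db_name>.<collection_name> namespace
    OpenAIEmbeddings(),
    index_name="langchain_demo",
)

# texts are embedded and inserted in batches of DEFAULT_INSERT_BATCH_SIZE
vectorstore.add_texts(["Cats have fluff."], metadatas=[{"source": "demo"}])

# scores come from the {"$meta": "searchScore"} projection in the pipeline
for doc, score in vectorstore.similarity_search_with_score("fluffy animals", k=1):
    print(score, doc.page_content)
```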

poetry.lock generated

@@ -6965,7 +6965,7 @@ tests = ["duckdb", "polars[pandas,pyarrow]", "pytest"]
name = "pymongo"
version = "4.3.3"
description = "Python driver for MongoDB <http://www.mongodb.org>"
category = "dev"
category = "main"
optional = false
python-versions = ">=3.7"
files = [
@@ -10948,7 +10948,7 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml", "manifest-ml", "momento", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
all = ["O365", "aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "docarray", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "jq", "lancedb", "langkit", "lark", "lxml", "manifest-ml", "momento", "neo4j", "networkx", "nlpcloud", "nltk", "nomic", "openai", "openlm", "opensearch-py", "pdfminer-six", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pymongo", "pyowm", "pypdf", "pytesseract", "pyvespa", "qdrant-client", "redis", "requests-toolbelt", "sentence-transformers", "spacy", "steamship", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
azure = ["azure-ai-formrecognizer", "azure-ai-vision", "azure-cognitiveservices-speech", "azure-core", "azure-cosmos", "azure-identity", "openai"]
cohere = ["cohere"]
docarray = ["docarray"]
@@ -10962,4 +10962,4 @@ text-helpers = ["chardet"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "b3dc23f376de141d22b729d038144a1e6d66983a910160c3500fe0d79f8e5917"
content-hash = "937d2f0165f6aa381ea1e26002272a92b189ab18607bd05895e36d23f56978f4"

@@ -36,6 +36,7 @@ jinja2 = {version = "^3", optional = true}
tiktoken = {version = "^0.3.2", optional = true, python="^3.9"}
pinecone-client = {version = "^2", optional = true}
pinecone-text = {version = "^0.4.2", optional = true}
pymongo = {version = "^4.3.3", optional = true}
clickhouse-connect = {version="^0.5.14", optional=true}
weaviate-client = {version = "^3", optional = true}
google-api-python-client = {version = "2.70.0", optional = true}
@@ -159,6 +160,7 @@ elasticsearch = {extras = ["async"], version = "^8.6.2"}
redis = "^4.5.4"
pinecone-client = "^2.2.1"
pinecone-text = "^0.4.2"
pymongo = "^4.3.3"
clickhouse-connect = "^0.5.14"
pgvector = "^0.1.6"
transformers = "^4.27.4"
@@ -174,7 +176,6 @@ gptcache = "^0.1.9"
promptlayer = "^0.1.80"
tair = "^1.3.3"
wikipedia = "^1"
pymongo = "^4.3.3"
cassandra-driver = "^3.27.0"
arxiv = "^1.4"
mastodon-py = "^1.8.1"
@@ -234,6 +235,7 @@ all = [
"jinja2",
"pinecone-client",
"pinecone-text",
"pymongo",
"weaviate-client",
"redis",
"google-api-python-client",

@@ -22,4 +22,8 @@ PINECONE_ENVIRONMENT=us-west4-gcp
# details here https://learn.microsoft.com/en-us/dotnet/api/azure.identity.defaultazurecredential?view=azure-dotnet
POWERBI_DATASET_ID=_powerbi_dataset_id_here
POWERBI_TABLE_NAME=_test_table_name_here
POWERBI_NUMROWS=_num_rows_in_your_test_table
# MongoDB Atlas Vector Search
MONGODB_ATLAS_URI=your_mongodb_atlas_connection_string

@@ -0,0 +1,135 @@
"""Test MongoDB Atlas Vector Search functionality."""
from __future__ import annotations
import os
from time import sleep
from typing import TYPE_CHECKING, Optional
import pytest
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.mongodb_atlas import MongoDBAtlasVectorSearch
if TYPE_CHECKING:
from pymongo import MongoClient
INDEX_NAME = "langchain-test-index"
NAMESPACE = "langchain_test_db.langchain_test_collection"
CONNECTION_STRING = os.environ.get("MONGODB_ATLAS_URI")
DB_NAME, COLLECTION_NAME = NAMESPACE.split(".")
def get_test_client() -> Optional[MongoClient]:
try:
from pymongo import MongoClient
client: MongoClient = MongoClient(CONNECTION_STRING)
return client
except: # noqa: E722
return None
# Instantiate as constant instead of pytest fixture to prevent needing to make multiple
# connections.
TEST_CLIENT = get_test_client()
class TestMongoDBAtlasVectorSearch:
@classmethod
def setup_class(cls) -> None:
# insure the test collection is empty
assert TEST_CLIENT[DB_NAME][COLLECTION_NAME].count_documents({}) == 0 # type: ignore[index] # noqa: E501
@classmethod
def teardown_class(cls) -> None:
# delete all the documents in the collection
TEST_CLIENT[DB_NAME][COLLECTION_NAME].delete_many({}) # type: ignore[index]
@pytest.fixture(autouse=True)
def setup(self) -> None:
# delete all the documents in the collection
TEST_CLIENT[DB_NAME][COLLECTION_NAME].delete_many({}) # type: ignore[index]
def test_from_documents(self, embedding_openai: Embeddings) -> None:
"""Test end to end construction and search."""
documents = [
Document(page_content="Dogs are tough.", metadata={"a": 1}),
Document(page_content="Cats have fluff.", metadata={"b": 1}),
Document(page_content="What is a sandwich?", metadata={"c": 1}),
Document(page_content="That fence is purple.", metadata={"d": 1, "e": 2}),
]
vectorstore = MongoDBAtlasVectorSearch.from_documents(
documents,
embedding_openai,
client=TEST_CLIENT,
namespace=NAMESPACE,
index_name=INDEX_NAME,
)
sleep(1) # waits for mongot to update Lucene's index
output = vectorstore.similarity_search("Sandwich", k=1)
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
def test_from_texts(self, embedding_openai: Embeddings) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"That fence is purple.",
]
vectorstore = MongoDBAtlasVectorSearch.from_texts(
texts,
embedding_openai,
client=TEST_CLIENT,
namespace=NAMESPACE,
index_name=INDEX_NAME,
)
sleep(1) # waits for mongot to update Lucene's index
output = vectorstore.similarity_search("Sandwich", k=1)
assert output[0].page_content == "What is a sandwich?"
def test_from_texts_with_metadatas(self, embedding_openai: Embeddings) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"The fence is purple.",
]
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
vectorstore = MongoDBAtlasVectorSearch.from_texts(
texts,
embedding_openai,
metadatas=metadatas,
client=TEST_CLIENT,
namespace=NAMESPACE,
index_name=INDEX_NAME,
)
sleep(1) # waits for mongot to update Lucene's index
output = vectorstore.similarity_search("Sandwich", k=1)
assert output[0].page_content == "What is a sandwich?"
assert output[0].metadata["c"] == 1
def test_from_texts_with_metadatas_and_pre_filter(
self, embedding_openai: Embeddings
) -> None:
texts = [
"Dogs are tough.",
"Cats have fluff.",
"What is a sandwich?",
"The fence is purple.",
]
metadatas = [{"a": 1}, {"b": 1}, {"c": 1}, {"d": 1, "e": 2}]
vectorstore = MongoDBAtlasVectorSearch.from_texts(
texts,
embedding_openai,
metadatas=metadatas,
client=TEST_CLIENT,
namespace=NAMESPACE,
index_name=INDEX_NAME,
)
sleep(1) # waits for mongot to update Lucene's index
output = vectorstore.similarity_search(
"Sandwich", k=1, pre_filter={"range": {"lte": 0, "path": "c"}}
)
assert output == []
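Note for reviewers: these tests assume an Atlas Search index named `langchain-test-index` already exists on the `langchain_test_db.langchain_test_collection` namespace, and that an `embedding_openai` fixture is provided elsewhere (it is not defined in this diff). The index presumably uses the same `knnVector` mapping shown in the notebook:

```json
{
  "mappings": {
    "dynamic": true,
    "fields": {
      "embedding": {
        "dimensions": 1536,
        "similarity": "cosine",
        "type": "knnVector"
      }
    }
  }
}
```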