forked from Archives/langchain
ab44c24333
Hello folks, thanks for creating and maintaining this great project. I'm excited to submit this PR to add Alibaba Cloud OpenSearch as a new vector store. OpenSearch is a one-stop platform for developing intelligent search services. OpenSearch was built on the large-scale distributed search engine developed by Alibaba. OpenSearch serves more than 500 business cases in Alibaba Group and thousands of Alibaba Cloud customers. OpenSearch helps develop search services in different search scenarios, including e-commerce, O2O, multimedia, the content industry, communities and forums, and big data query in enterprises. OpenSearch provides a vector search feature. In specific scenarios, especially test question search and image search scenarios, you can use the vector search feature together with the multimodal search feature to improve the accuracy of search results. This PR includes: an AlibabaCloudOpenSearch class that can connect to an Alibaba Cloud OpenSearch instance; adding embeddings and metadata to an OpenSearch datasource; querying by squared Euclidean distance and metadata; integration tests; an IPython notebook and docs. I have read your contributing guidelines, and I have passed the checks below - [x] make format - [x] make lint - [x] make coverage - [x] make test --------- Co-authored-by: zhaoshengbo <shengbo.zsb@alibaba-inc.com>
129 lines
4.9 KiB
Python
129 lines
4.9 KiB
Python
from typing import List
|
||
|
||
from langchain.schema import Document
|
||
from langchain.vectorstores.alibabacloud_opensearch import (
|
||
AlibabaCloudOpenSearch,
|
||
AlibabaCloudOpenSearchSettings,
|
||
)
|
||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||
|
||
# Length of every fake embedding vector built by FakeEmbeddingsWithOsDimension.
# NOTE(review): presumably has to match the vector dimension configured on the
# OpenSearch instance under test - confirm.
OS_TOKEN_COUNT = 1536

# Sample corpus indexed by the tests; a text's position in this list becomes the
# last component of its fake embedding and its "metadata" value.
texts = ["foo", "bar", "baz"]
|
||
|
||
|
||
class FakeEmbeddingsWithOsDimension(FakeEmbeddings):
    """Deterministic fake embeddings whose vectors have OS_TOKEN_COUNT entries.

    Every vector is a run of 1.0 values followed by one position-dependent
    value, so vectors for different positions differ only in the last entry.
    """

    def embed_documents(self, embedding_texts: List[str]) -> List[List[float]]:
        """Return one vector per input text.

        The content of each text is ignored; only its position in
        ``embedding_texts`` is encoded, as the final component.
        """
        shared_prefix = [1.0] * (OS_TOKEN_COUNT - 1)
        # List concatenation builds a fresh list per document, so the
        # returned vectors never alias each other or the prefix.
        return [
            shared_prefix + [float(position)]
            for position, _ in enumerate(embedding_texts)
        ]

    def embed_query(self, text: str) -> List[float]:
        """Return the vector for ``text`` based on its index in ``texts``.

        Raises:
            ValueError: if ``text`` is not one of the module-level ``texts``.
        """
        position = texts.index(text)
        vector = [1.0] * (OS_TOKEN_COUNT - 1)
        vector.append(float(position))
        return vector
|
||
|
||
|
||
# Connection settings for the OpenSearch instance under test. Each value is a
# description of what belongs in that slot.
# NOTE(review): presumably these must be replaced with real instance details
# before the tests can run - confirm.
settings = AlibabaCloudOpenSearchSettings(
    endpoint=(
        "The endpoint of opensearch instance, "
        "You can find it from the console of Alibaba Cloud OpenSearch."
    ),
    instance_id=(
        "The identify of opensearch instance, "
        "You can find it from the console of Alibaba Cloud OpenSearch."
    ),
    datasource_name="The name of the data source specified when creating it.",
    username="The username specified when purchasing the instance.",
    password="The password specified when purchasing the instance.",
    embedding_index_name=(
        "The name of the vector attribute "
        "specified when configuring the instance attributes."
    ),
    field_name_mapping={
        # insert data into opensearch based on the mapping name of the field.
        "id": "The id field name map of index document.",
        "document": "The text field name map of index document.",
        "embedding": (
            "The embedding field name map of index document,"
            "the values must be in float16 multivalue type "
            "and separated by commas."
        ),
        "metadata_x": (
            "The metadata field name map of index document, "
            "could specify multiple, The value field contains "
            "mapping name and operator, the operator would be "
            "used when executing metadata filter query"
        ),
    },
)

# Shared embedder used by the vector-query test to turn query text into a vector.
embeddings = FakeEmbeddingsWithOsDimension()
|
||
|
||
|
||
def test_create_alibabacloud_opensearch() -> None:
    """Index the sample texts and check that a wide search returns three hits."""
    store = create_alibabacloud_opensearch()
    # k exceeds the number of sample texts, so the count reflects what is stored.
    # NOTE(review): presumably assumes the datasource holds no other documents.
    hits = store.similarity_search("foo", k=10)
    assert len(hits) == 3
|
||
|
||
|
||
def test_alibabacloud_opensearch_with_text_query() -> None:
    """Check that a k=1 text search for each sample returns its own document."""
    store = create_alibabacloud_opensearch()
    # (query text, metadata value expected on the single returned document)
    cases = (("foo", "0"), ("bar", "1"), ("baz", "2"))
    for query, meta in cases:
        hits = store.similarity_search(query, k=1)
        assert hits == [Document(page_content=query, metadata={"metadata": meta})]
|
||
|
||
|
||
def test_alibabacloud_opensearch_with_vector_query() -> None:
    """Check that a k=1 vector search for each sample returns its own document."""
    store = create_alibabacloud_opensearch()
    # (text whose query vector is searched, metadata value expected on the hit)
    cases = (("foo", "0"), ("bar", "1"), ("baz", "2"))
    for query, meta in cases:
        query_vector = embeddings.embed_query(query)
        hits = store.similarity_search_by_vector(query_vector, k=1)
        assert hits == [Document(page_content=query, metadata={"metadata": meta})]
|
||
|
||
|
||
def test_alibabacloud_opensearch_with_text_and_meta_query() -> None:
    """Check text search combined with a metadata filter.

    A filter matching the queried text's metadata must return that document;
    a filter value that no indexed document carries must return nothing.
    """
    store = create_alibabacloud_opensearch()

    # (query text, filter value that matches the document for that text)
    matching_cases = (("foo", "0"), ("bar", "1"), ("baz", "2"))
    for query, meta in matching_cases:
        hits = store.similarity_search(
            query=query, search_filter={"metadata": meta}, k=1
        )
        assert hits == [Document(page_content=query, metadata={"metadata": meta})]

    # "3" is not assigned to any of the three sample texts.
    unmatched = store.similarity_search(
        query="baz", search_filter={"metadata": "3"}, k=1
    )
    assert len(unmatched) == 0
|
||
|
||
|
||
def test_alibabacloud_opensearch_with_text_and_meta_score_query() -> None:
    """Check the (document, score) pair returned by a filtered scored search."""
    store = create_alibabacloud_opensearch()
    scored_hits = store.similarity_search_with_relevance_scores(
        query="foo", search_filter={"metadata": "0"}, k=1
    )
    expected_document = Document(page_content="foo", metadata={"metadata": "0"})
    # The test pins the score for the "foo" query against its own document.
    expected_score = 10000.0
    assert scored_hits == [(expected_document, expected_score)]
|
||
|
||
|
||
def create_alibabacloud_opensearch() -> AlibabaCloudOpenSearch:
    """Build a store from the module-level sample texts and settings.

    Each text is paired with a ``{"metadata": "<index>"}`` dict, where the
    index is the text's position in ``texts`` rendered as a string.
    """
    metadata_per_text = [
        {"metadata": str(position)} for position, _ in enumerate(texts)
    ]
    # A fresh fake embedder is created for every store.
    embedder = FakeEmbeddingsWithOsDimension()
    return AlibabaCloudOpenSearch.from_texts(
        texts=texts,
        embedding=embedder,
        metadatas=metadata_per_text,
        config=settings,
    )
|