mirror of https://github.com/hwchase17/langchain
Add: USearch Vector Store (#8835)
## Description I am excited to propose an integration with USearch, a lightweight vector-search engine available for both Python and JavaScript, among other languages. ## Dependencies It introduces a new PyPI dependency - `usearch`. I am unsure if it must be added to the Poetry file, as this would make the PR too clunky. Please let me know. ## Profiles - Maintainers: @ashvardanian @davvard - Twitter handles: @ashvardanian @unum_cloud --------- Co-authored-by: Davit Vardanyan <78792753+davvard@users.noreply.github.com> Co-authored-by: Bagatur <baskaryan@gmail.com> pull/8903/head
parent
b52a3785c9
commit
1f9124ceaa
@ -0,0 +1,176 @@
|
|||||||
|
"""Wrapper around USearch vector database."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
from langchain.docstore.base import AddableMixin, Docstore
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.docstore.in_memory import InMemoryDocstore
|
||||||
|
from langchain.embeddings.base import Embeddings
|
||||||
|
from langchain.vectorstores.base import VectorStore
|
||||||
|
|
||||||
|
|
||||||
|
def dependable_usearch_import() -> Any:
    """
    Import usearch if available, otherwise raise error.

    The import is performed lazily so that ``usearch`` is only required by
    callers that actually build an index.

    Returns:
        The ``usearch.index`` module.

    Raises:
        ImportError: If ``usearch.index`` cannot be imported. The original
            import failure is attached as ``__cause__``.
    """
    try:
        import usearch.index
    except ImportError as exc:
        # Chain explicitly so the root cause (missing package, broken native
        # extension, ...) is reported as the direct cause of this error.
        raise ImportError(
            "Could not import usearch python package. "
            "Please install it with `pip install usearch` "
        ) from exc
    return usearch.index
|
||||||
|
|
||||||
|
|
||||||
|
class USearch(VectorStore):
    """Wrapper around USearch vector database.

    To use, you should have the ``usearch`` python package installed.

    Embeddings are stored in ``index``; the matching ``Document`` objects are
    stored in ``docstore`` under ``str(key)``, which is also how search results
    are resolved back to documents.
    """

    def __init__(
        self,
        embedding: Embeddings,
        index: Any,
        docstore: Docstore,
        ids: List[str],
    ):
        """Initialize with necessary components.

        Args:
            embedding: Embeddings object used to embed documents and queries.
            index: Index object exposing ``add(keys, vectors)`` and
                ``search(vector, k)``.
            docstore: Store holding the documents, keyed by string id.
            ids: Ids already present in the index, in insertion order. The
                list is extended in place by ``add_texts``.
        """
        self.embedding = embedding
        self.index = index
        self.docstore = docstore
        self.ids = ids

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional list of unique IDs. When omitted, consecutive
                integer ids (as strings) are generated, continuing after the
                last stored id.

        Returns:
            List of ids from adding the texts into the vectorstore.

        Raises:
            ValueError: If the underlying docstore does not support adding
                items.
        """
        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )

        # Materialize once: `texts` may be a one-shot iterator, and it is needed
        # both for embedding and for building the documents.
        text_list = list(texts)
        if not text_list:
            return []

        embeddings = self.embedding.embed_documents(text_list)
        documents = [
            Document(page_content=text, metadata=metadatas[i] if metadatas else {})
            for i, text in enumerate(text_list)
        ]

        if ids is None:
            # Only auto-generated ids depend on the last stored id, so explicit
            # ids work on an empty store or alongside non-numeric stored ids.
            next_id = int(self.ids[-1]) + 1 if self.ids else 0
            ids = np.array([str(next_id + offset) for offset in range(len(text_list))])

        # Lookups use `str(key)`, so store the documents under string keys.
        keys = [str(key) for key in ids]

        self.index.add(np.array(ids), np.array(embeddings))
        self.docstore.add(dict(zip(keys, documents)))
        self.ids.extend(keys)
        return keys

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query with distance.

        Raises:
            ValueError: If a key returned by the index has no document in the
                docstore.
        """
        query_embedding = self.embedding.embed_query(query)
        matches = self.index.search(np.array(query_embedding), k)

        docs_with_scores: List[Tuple[Document, float]] = []
        for key, score in zip(matches.keys, matches.distances):
            doc = self.docstore.search(str(key))
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {key}, got {doc}")
            docs_with_scores.append((doc, score))

        return docs_with_scores

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.

        Raises:
            ValueError: If a key returned by the index has no document in the
                docstore.
        """
        # Same lookup as the scored variant; only the distances are dropped.
        return [doc for doc, _ in self.similarity_search_with_score(query, k)]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        metric: str = "cos",
        **kwargs: Any,
    ) -> USearch:
        """Construct USearch wrapper from raw documents.
        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates an in memory docstore
            3. Initializes the USearch database
        This is intended to be a quick way to get started.

        Args:
            texts: Texts to embed and index. Must be non-empty, because the
                index dimensionality is taken from the first embedding.
            embedding: Embeddings object used to embed the texts.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional unique IDs. Defaults to ``"0"``, ``"1"``, ...
            metric: Metric name forwarded to the ``usearch`` index constructor.

        Returns:
            A ``USearch`` instance holding the new index and docstore.

        Example:
            .. code-block:: python

                from langchain.vectorstores import USearch
                from langchain.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                usearch = USearch.from_texts(texts, embeddings)
        """
        embeddings = embedding.embed_documents(texts)

        if ids is None:
            ids = np.array([str(position) for position in range(len(texts))])
        documents: List[Document] = [
            Document(page_content=text, metadata=metadatas[i] if metadatas else {})
            for i, text in enumerate(texts)
        ]

        # Lookups use `str(key)`, so store the documents under string keys.
        keys = [str(key) for key in ids]
        docstore = InMemoryDocstore(dict(zip(keys, documents)))

        usearch = dependable_usearch_import()
        index = usearch.Index(ndim=len(embeddings[0]), metric=metric)
        index.add(np.array(ids), np.array(embeddings))
        return cls(embedding, index, docstore, keys)
|
@ -0,0 +1,59 @@
|
|||||||
|
"""Test USearch functionality."""
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.vectorstores.usearch import USearch
|
||||||
|
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||||
|
|
||||||
|
|
||||||
|
def test_usearch_from_texts() -> None:
    """Build a store from raw texts and check the top hit for a known text."""
    store = USearch.from_texts(["foo", "bar", "baz"], FakeEmbeddings())
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_usearch_from_documents() -> None:
    """Build a store via ``from_documents`` and check metadata survives search."""
    source_docs = [
        Document(page_content=content, metadata={"a": "b"})
        for content in ("foo", "bar", "baz")
    ]
    store = USearch.from_documents(source_docs, FakeEmbeddings())
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"a": "b"})]
    assert hits == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_usearch_add_texts() -> None:
    """Add a duplicate text to an existing store and expect both copies back."""
    store = USearch.from_texts(["foo", "bar", "baz"], FakeEmbeddings())
    store.add_texts(["foo"])
    hits = store.similarity_search("foo", k=2)
    expected = [Document(page_content="foo"), Document(page_content="foo")]
    assert hits == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_ip() -> None:
    """Check the second-ranked score when the index uses the "ip" metric."""
    store = USearch.from_texts(["foo", "bar", "baz"], FakeEmbeddings(), metric="ip")
    ranked = store.similarity_search_with_score("far", k=2)
    _, second_score = ranked[1]
    assert second_score == -8.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_l2() -> None:
    """Check the second-ranked score when the index uses the "l2_sq" metric."""
    store = USearch.from_texts(
        ["foo", "bar", "baz"], FakeEmbeddings(), metric="l2_sq"
    )
    ranked = store.similarity_search_with_score("far", k=2)
    _, second_score = ranked[1]
    assert second_score == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
def test_cos() -> None:
    """Check the second-ranked score when the index uses the "cos" metric."""
    store = USearch.from_texts(["foo", "bar", "baz"], FakeEmbeddings(), metric="cos")
    ranked = store.similarity_search_with_score("far", k=2)
    _, second_score = ranked[1]
    # The score is compared with an absolute tolerance rather than exactly.
    assert second_score == pytest.approx(0.05, abs=0.002)
|
Loading…
Reference in New Issue