mirror of https://github.com/hwchase17/langchain
Added AwaDB vector store, which is a wrapper over the AwaDB, that can be used as a vector storage and has an efficient similarity search. Added integration tests for the vector store Added jupyter notebook with the example Delete a unneeded empty file and resolve the conflict(https://github.com/hwchase17/langchain/pull/5886) Please check, Thanks! @dev2049 @hwchase17 --------- <!-- Thank you for contributing to LangChain! Your PR will appear in our release under the title you set. Please make sure it highlights your valuable contribution. Replace this with a description of the change, the issue it fixes (if applicable), and relevant context. List any dependencies required for this change. After you're done, someone will review your PR. They may suggest improvements. If no one reviews your PR within a few days, feel free to @-mention the same people again, as notifications can get lost. Finally, we'd love to show appreciation for your contribution - if you'd like us to shout you out on Twitter, please also include your handle! --> <!-- Remove if not applicable --> Fixes # (issue) #### Before submitting <!-- If you're adding a new integration, please include: 1. a test for the integration - favor unit tests that does not rely on network access. 2. an example notebook showing its use See contribution guidelines for more information on how to write tests, lint etc: https://github.com/hwchase17/langchain/blob/master/.github/CONTRIBUTING.md --> #### Who can review? Tag maintainers/contributors who might be interested: <!-- For a quicker response, figure out the right person to tag with @ @hwchase17 - project lead Tracing / Callbacks - @agola11 Async - @agola11 DataLoaders - @eyurtsev Models - @hwchase17 - @agola11 Agents / Tools / Toolkits - @vowelparrot VectorStores / Retrievers / Memory - @dev2049 --> --------- Co-authored-by: ljeagle <vincent_jieli@yeah.net> Co-authored-by: vincent <awadb.vincent@gmail.com>pull/6001/head
parent
d5819a7ca7
commit
9218684759
@ -0,0 +1,21 @@
|
||||
# AwaDB
|
||||
|
||||
>[AwaDB](https://github.com/awa-ai/awadb) is an AI-native database for the search and storage of embedding vectors used by LLM applications.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
```bash
|
||||
pip install awadb
|
||||
```
|
||||
|
||||
|
||||
## VectorStore
|
||||
|
||||
There exists a wrapper around AwaDB vector databases, allowing you to use them as a vectorstore,
|
||||
whether for semantic search or example selection.
|
||||
|
||||
```python
|
||||
from langchain.vectorstores import AwaDB
|
||||
```
|
||||
|
||||
For a more detailed walkthrough of the AwaDB wrapper, see [this notebook](../modules/indexes/vectorstores/examples/awadb.ipynb).
|
@ -0,0 +1,284 @@
|
||||
"""Wrapper around AwaDB for embedding vectors"""
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
|
||||
# from pydantic import BaseModel, Field, root_validator
|
||||
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import awadb
|
||||
|
||||
# Use a module-scoped logger rather than the root logger so that applications
# can configure or silence this module's output independently.
logger = logging.getLogger(__name__)
# Default number of documents returned by the similarity-search methods.
DEFAULT_TOPN = 4
|
||||
|
||||
|
||||
class AwaDB(VectorStore):
    """Vector store backed by an AwaDB client.

    Texts are written to an AwaDB table through ``client.Add``. When an
    embedding model is supplied, vectors are computed here and stored next to
    the text; otherwise only the raw text is handed to AwaDB under the
    ``"embedding_text"`` key.
    """

    _DEFAULT_TABLE_NAME = "langchain_awadb"

    def __init__(
        self,
        table_name: str = _DEFAULT_TABLE_NAME,
        embedding_model: Optional[Embeddings] = None,
        log_and_data_dir: Optional[str] = None,
        client: Optional[awadb.Client] = None,
    ) -> None:
        """Initialize with an AwaDB client and create the table.

        Args:
            table_name: Name of the table passed to ``client.Create``.
            embedding_model: Optional embedding model used for documents
                and queries.
            log_and_data_dir: Optional directory forwarded to
                ``awadb.Client`` when no ``client`` is given.
            client: Optional pre-built AwaDB client. When provided,
                ``log_and_data_dir`` is ignored.

        Raises:
            ValueError: If the ``awadb`` package cannot be imported.
        """
        try:
            import awadb
        except ImportError:
            raise ValueError(
                "Could not import awadb python package. "
                "Please install it with `pip install awadb`."
            )

        if client is not None:
            self.awadb_client = client
        elif log_and_data_dir is not None:
            self.awadb_client = awadb.Client(log_and_data_dir)
        else:
            self.awadb_client = awadb.Client()

        self.awadb_client.Create(table_name)

        # Always define the attribute (even as None): every other method
        # tests `self.embedding_model is not None`, which would raise
        # AttributeError if the attribute were only set conditionally.
        self.embedding_model = embedding_model

        # Running counter used to derive the ids returned by `add_texts`.
        self.added_doc_count = 0

    def _embed_query(self, query: str) -> Optional[List[float]]:
        """Embed ``query`` with the configured model, or return None."""
        if self.embedding_model is None:
            # NOTE(review): without an embedding model, None is forwarded to
            # `client.Search`; confirm against the AwaDB API that this is a
            # supported query form.
            return None
        return self.embedding_model.embed_query(query)

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.
                Texts beyond the end of this list are added without metadata.
            kwargs: vectorstore specific parameters (currently unused).

        Returns:
            List of ids from adding the texts into the vectorstore. Ids are
            the string form of this instance's running document counter.

        Raises:
            ValueError: If the AwaDB client is None.
        """
        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

        # Materialise once: `texts` may be a one-shot iterator, and it is
        # needed both for embedding and for the insertion loop below.
        text_list = list(texts)

        embeddings = None
        if self.embedding_model is not None:
            embeddings = self.embedding_model.embed_documents(text_list)

        added_results: List[str] = []
        for doc_no, text in enumerate(text_list):
            doc: List[Any] = []
            if embeddings is not None:
                doc.append(text)
                doc.append(embeddings[doc_no])
            else:
                doc.append({"embedding_text": text})

            if metadatas is not None and doc_no < len(metadatas):
                doc.append(metadatas[doc_no])

            self.awadb_client.Add(doc)
            added_results.append(str(self.added_doc_count))
            self.added_doc_count += 1

        return added_results

    def load_local(
        self,
        table_name: str = _DEFAULT_TABLE_NAME,
        **kwargs: Any,
    ) -> bool:
        """Load a table through the AwaDB client.

        Args:
            table_name: Name of the table passed to ``client.Load``.
            kwargs: Unused.

        Returns:
            Whatever ``client.Load`` returns (annotated as a bool).

        Raises:
            ValueError: If the AwaDB client is None.
        """
        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

        return self.awadb_client.Load(table_name)

    def similarity_search(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            kwargs: Unused.

        Returns:
            List of Documents most similar to the query.

        Raises:
            ValueError: If the AwaDB client is None.
        """
        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

        return self.similarity_search_by_vector(self._embed_query(query), k)

    def similarity_search_with_score(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar. Each raw score is divided by the
        L2 norm of all returned raw scores and subtracted from 1.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            kwargs: Unused.

        Returns:
            List of ``(Document, score)`` tuples; empty when nothing matched.

        Raises:
            ValueError: If the AwaDB client is None.
        """
        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

        embedding = self._embed_query(query)

        # A single search is enough: `similarity_search_by_vector` fills
        # `scores` in result order and returns [] when nothing matched.
        scores: List[float] = []
        retrieval_docs = self.similarity_search_by_vector(embedding, k, scores)

        l2_norm = pow(sum(score * score for score in scores), 0.5)

        results: List[Tuple[Document, float]] = []
        for doc_no, doc in enumerate(retrieval_docs):
            # A zero norm means every raw distance is 0, i.e. every hit is
            # maximally similar; avoid dividing by zero in that case.
            normalized = scores[doc_no] / l2_norm if l2_norm else 0.0
            results.append((doc, 1 - normalized))

        return results

    def similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = DEFAULT_TOPN,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar. The computation is the same as
        ``similarity_search_with_score``, so this method delegates to it.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            kwargs: Forwarded to ``similarity_search_with_score``.

        Returns:
            List of ``(Document, score)`` tuples; empty when nothing matched.

        Raises:
            ValueError: If the AwaDB client is None.
        """
        return self.similarity_search_with_score(query, k, **kwargs)

    def similarity_search_by_vector(
        self,
        embedding: List[float],
        k: int = DEFAULT_TOPN,
        scores: Optional[list] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            scores: Optional output list. When given, the raw ``"score"``
                value of each hit is appended to it in result order.
            kwargs: Unused.

        Returns:
            List of Documents most similar to the query vector.

        Raises:
            ValueError: If the AwaDB client is None.
        """
        if self.awadb_client is None:
            raise ValueError("AwaDB client is None!!!")

        show_results = self.awadb_client.Search(embedding, k)

        results: List[Document] = []
        if not show_results:
            return results

        for item_detail in show_results[0]["ResultItems"]:
            content = ""
            meta_data = {}
            for item_key in item_detail:
                if item_key == "Field@0":  # text for the document
                    content = item_detail[item_key]
                elif item_key == "Field@1":  # embedding field for the document
                    continue
                elif item_key == "score":  # L2 distance
                    if scores is not None:
                        scores.append(item_detail[item_key])
                else:
                    # Every other key is surfaced as document metadata.
                    meta_data[item_key] = item_detail[item_key]
            results.append(Document(page_content=content, metadata=meta_data))
        return results

    @classmethod
    def from_texts(
        cls: Type[AwaDB],
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        table_name: str = _DEFAULT_TABLE_NAME,
        logging_and_data_dir: Optional[str] = None,
        client: Optional[awadb.Client] = None,
        **kwargs: Any,
    ) -> AwaDB:
        """Create an AwaDB vectorstore from raw documents.

        Args:
            texts (List[str]): List of texts to add to the table.
            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
            metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
            table_name (str): Name of the table to create.
            logging_and_data_dir (Optional[str]): Directory of logging and persistence.
            client (Optional[awadb.Client]): AwaDB client

        Returns:
            AwaDB: AwaDB vectorstore.
        """
        store = cls(
            table_name=table_name,
            embedding_model=embedding,
            log_and_data_dir=logging_and_data_dir,
            client=client,
        )
        store.add_texts(texts=texts, metadatas=metadatas)
        return store
|
@ -0,0 +1,55 @@
|
||||
"""Test AwaDB functionality."""
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.vectorstores import AwaDB
|
||||
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings
|
||||
|
||||
|
||||
def test_awadb() -> None:
    """Build a store from three texts and check the top hit for "foo"."""
    store = AwaDB.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
        table_name="test_awadb",
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo")]
    assert hits == expected
|
||||
|
||||
|
||||
def test_awadb_with_metadatas() -> None:
    """Build a store with per-text metadata and check it on the top hit."""
    corpus = ["foo", "bar", "baz"]
    page_info = [{"page": index} for index, _ in enumerate(corpus)]
    store = AwaDB.from_texts(
        texts=corpus,
        embedding=FakeEmbeddings(),
        metadatas=page_info,
        table_name="test_awadb",
    )
    hits = store.similarity_search("foo", k=1)
    expected = [Document(page_content="foo", metadata={"page": 0})]
    assert hits == expected
|
||||
|
||||
|
||||
def test_awadb_with_metadatas_with_scores() -> None:
    """Build a store with string metadata and check the scored top hit."""
    corpus = ["foo", "bar", "baz"]
    page_info = [{"page": str(index)} for index, _ in enumerate(corpus)]
    store = AwaDB.from_texts(
        texts=corpus,
        embedding=FakeEmbeddings(),
        metadatas=page_info,
        table_name="test_awadb",
    )
    scored_hits = store.similarity_search_with_score("foo", k=1)
    expected_doc = Document(page_content="foo", metadata={"page": "0"})
    assert scored_hits == [(expected_doc, 0.0)]
|
||||
|
||||
|
||||
def test_awadb_add_texts() -> None:
    """Add a duplicate text after construction and expect both copies back."""
    # Seed the store with the initial corpus.
    store = AwaDB.from_texts(
        texts=["foo", "bar", "baz"],
        embedding=FakeEmbeddings(),
        table_name="test_awadb",
    )
    # Insert a second "foo" so that two identical documents exist.
    store.add_texts(["foo"])
    hits = store.similarity_search("foo", k=2)
    expected = [Document(page_content="foo"), Document(page_content="foo")]
    assert hits == expected
|
Loading…
Reference in New Issue