ElasticVectorSearch: Add in vector search backed by Elastic (#67)

![image](https://user-images.githubusercontent.com/6690839/200147455-33a68e20-c3c0-4045-9bff-598b38ae8fb2.png)

woo!

Co-authored-by: Harrison Chase <hw.chase.17@gmail.com>
This commit is contained in:
Samantha Whitmore 2022-11-08 07:01:42 -08:00 committed by GitHub
parent efbc03bda8
commit e48e562ea5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 221 additions and 4 deletions

View File

@ -47,6 +47,11 @@ The following use cases require specific installs and api keys:
- Install requirements with `pip install playwright`
- _Wikipedia_:
- Install requirements with `pip install wikipedia`
- _Elasticsearch_:
- Install requirements with `pip install elasticsearch`
- Set up an Elasticsearch backend. If you want to run it locally, [this](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/getting-started.html) is a good guide.
- _FAISS_:
- Install requirements with `pip install faiss` for Python 3.7 and `pip install faiss-cpu` for Python 3.10+.
## 🚀 What can I do with this
@ -98,6 +103,28 @@ question = "What NFL team won the Super Bowl in the year Justin Beiber was born?
llm_chain.predict(question=question)
```
**Embed & Search Documents**
We support two vector databases to store and search embeddings -- FAISS and Elasticsearch. Here's a code snippet showing how to use FAISS to store embeddings and search for text similar to a query. Both database backends are featured in this [example notebook](https://github.com/hwchase17/langchain/blob/master/notebooks/examples/embeddings.ipynb).
```
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.faiss import FAISS
from langchain.text_splitter import CharacterTextSplitter
with open('state_of_the_union.txt') as f:
state_of_the_union = f.read()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_text(state_of_the_union)
embeddings = OpenAIEmbeddings()
docsearch = FAISS.from_texts(texts, embeddings)
query = "What did the president say about Ketanji Brown Jackson"
docs = docsearch.similarity_search(query)
```
## 📖 Documentation
The above examples are probably the most user friendly documentation that exists,

View File

@ -8,6 +8,7 @@
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.elastic_vector_search import ElasticVectorSearch\n",
"from langchain.faiss import FAISS\n",
"from langchain.text_splitter import CharacterTextSplitter"
]
@ -24,8 +25,7 @@
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"docsearch = FAISS.from_texts(texts, embeddings)"
"embeddings = OpenAIEmbeddings()"
]
},
{
@ -35,6 +35,8 @@
"metadata": {},
"outputs": [],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
]
@ -65,10 +67,49 @@
"print(docs[0].page_content)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "4906b8a3",
"metadata": {},
"outputs": [],
"source": [
"docsearch = ElasticVectorSearch.from_texts(\"http://localhost:9200\", texts, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "95f9eee9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n",
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence. \n",
"\n",
"A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since shes been nominated, shes received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n",
"\n",
"And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n"
]
}
],
"source": [
"print(docs[0].page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "25500fa6",
"id": "70a253c4",
"metadata": {},
"outputs": [],
"source": []
@ -90,7 +131,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.6"
"version": "3.10.4"
}
},
"nbformat": 4,

View File

@ -16,6 +16,7 @@ from langchain.chains import (
SQLDatabaseChain,
)
from langchain.docstore import Wikipedia
from langchain.elastic_vector_search import ElasticVectorSearch
from langchain.faiss import FAISS
from langchain.llms import Cohere, HuggingFaceHub, OpenAI
from langchain.prompts import BasePrompt, DynamicPrompt, Prompt
@ -39,4 +40,5 @@ __all__ = [
"SQLDatabaseChain",
"FAISS",
"MRKLChain",
"ElasticVectorSearch",
]

View File

@ -0,0 +1,146 @@
"""Wrapper around Elasticsearch vector database."""
import uuid
from typing import Callable, Dict, List
from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
def _default_text_mapping(dim: int) -> Dict:
    """Build the index mapping that stores each text next to its embedding.

    Args:
        dim: Value written to the ``dims`` entry of the ``vector`` field.

    Returns:
        A mapping dict with two properties: ``text`` (type ``text``) and
        ``vector`` (type ``dense_vector`` with ``dims`` set to ``dim``).
    """
    text_field = {"type": "text"}
    vector_field = {"type": "dense_vector", "dims": dim}
    return {"properties": {"text": text_field, "vector": vector_field}}
def _default_script_query(query_vector: List[float]) -> Dict:
    """Build a ``script_score`` query that ranks documents by cosine similarity.

    Args:
        query_vector: Embedding of the query. It is passed to the script as
            ``params.query_vector`` and compared with each document's
            ``vector`` field.

    Returns:
        A query dict that matches every document (``match_all``) and scores
        each one with ``cosineSimilarity(...) + 1.0``.
    """
    return {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                # The 1.0 offset shifts cosine similarity from [-1, 1] into
                # [0, 2]; Elasticsearch rejects negative script scores.
                "source": "cosineSimilarity(params.query_vector, 'vector') + 1.0",
                "params": {"query_vector": query_vector},
            },
        }
    }
class ElasticVectorSearch:
    """Wrapper around Elasticsearch as a vector database.

    Example:
        .. code-block:: python

            from langchain import ElasticVectorSearch
            elastic_vector_search = ElasticVectorSearch(
                "http://localhost:9200",
                "embeddings",
                mapping,
                embedding_function
            )

    """

    def __init__(
        self,
        elastic_url: str,
        index_name: str,
        mapping: Dict,
        embedding_function: Callable,
    ):
        """Initialize with necessary components.

        Args:
            elastic_url: URL handed to ``elasticsearch.Elasticsearch``.
            index_name: Name of the index that ``similarity_search`` queries.
            mapping: Index mapping; stored on the instance as ``self.mapping``.
            embedding_function: Callable that turns a query string into the
                vector passed to the similarity script.

        Raises:
            ValueError: If the ``elasticsearch`` package cannot be imported,
                or if the client rejects ``elastic_url`` with a ``ValueError``.
        """
        # Imported lazily so the package is only required when this class
        # is actually used.
        try:
            import elasticsearch
        except ImportError as err:
            raise ValueError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from err
        self.embedding_function = embedding_function
        self.index_name = index_name
        try:
            es_client = elasticsearch.Elasticsearch(elastic_url)  # noqa
        except ValueError as e:
            raise ValueError(
                "Your elasticsearch client string is misformatted. " f"Got error: {e} "
            ) from e
        self.client = es_client
        self.mapping = mapping

    def similarity_search(self, query: str, k: int = 4) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        embedding = self.embedding_function(query)
        script_query = _default_script_query(embedding)
        # Request exactly k hits. Without an explicit size, Elasticsearch
        # returns its default page of 10 hits, which would silently cap any
        # k larger than 10.
        response = self.client.search(
            index=self.index_name, query=script_query, size=k
        )
        hits = response["hits"]["hits"]
        return [Document(page_content=hit["_source"]["text"]) for hit in hits]

    @classmethod
    def from_texts(
        cls, elastic_url: str, texts: List[str], embedding: Embeddings
    ) -> "ElasticVectorSearch":
        """Construct ElasticVectorSearch wrapper from raw documents.

        This is a user friendly interface that:
            1. Embeds documents.
            2. Creates a new index for the embeddings in the Elasticsearch instance.
            3. Adds the documents to the newly created Elasticsearch index.

        This is intended to be a quick way to get started.

        Args:
            elastic_url: URL handed to ``elasticsearch.Elasticsearch``.
            texts: Texts to embed and index. Must not be empty.
            embedding: Object providing ``embed_documents`` (used here) and
                ``embed_query`` (used by the returned wrapper for searches).

        Returns:
            An ``ElasticVectorSearch`` bound to the newly created index.

        Raises:
            ValueError: If the ``elasticsearch`` package cannot be imported,
                if ``texts`` is empty, or if the client rejects
                ``elastic_url`` with a ``ValueError``.

        Example:
            .. code-block:: python

                from langchain import ElasticVectorSearch
                from langchain.embeddings import OpenAIEmbeddings
                embeddings = OpenAIEmbeddings()
                elastic_vector_search = ElasticVectorSearch.from_texts(
                    "http://localhost:9200",
                    texts,
                    embeddings
                )
        """
        try:
            import elasticsearch
            from elasticsearch.helpers import bulk
        except ImportError as err:
            raise ValueError(
                "Could not import elasticsearch python package. "
                "Please install it with `pip install elasticsearch`."
            ) from err
        # The vector dimension is read from the first embedding, so an empty
        # input cannot produce a mapping. Fail before calling the embedder.
        if not texts:
            raise ValueError(
                "Cannot create an Elasticsearch index from an empty list of texts."
            )
        try:
            client = elasticsearch.Elasticsearch(elastic_url)
        except ValueError as e:
            raise ValueError(
                "Your elasticsearch client string is misformatted. " f"Got error: {e} "
            ) from e
        # A random hex name gives every call its own fresh index.
        index_name = uuid.uuid4().hex
        embeddings = embedding.embed_documents(texts)
        dim = len(embeddings[0])
        mapping = _default_text_mapping(dim)
        # TODO would be nice to create index before embedding,
        # just to save expensive steps for last
        client.indices.create(index=index_name, mappings=mapping)
        # One bulk "index" action per text, paired with its embedding by position.
        requests = [
            {
                "_op_type": "index",
                "_index": index_name,
                "vector": embeddings[i],
                "text": text,
            }
            for i, text in enumerate(texts)
        ]
        bulk(client, requests)
        # Refresh so the documents just indexed are visible to searches
        # issued right after this method returns.
        client.indices.refresh(index=index_name)
        return cls(elastic_url, index_name, mapping, embedding.embed_query)

View File

@ -1,6 +1,7 @@
-r test_requirements.txt
# For integrations
cohere
elasticsearch
openai
google-search-results
nlpcloud