mirror of
https://github.com/hwchase17/langchain
synced 2024-11-02 09:40:22 +00:00
177 lines
5.8 KiB
Python
177 lines
5.8 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
from typing import Any, Dict, Iterable, List, Optional, Tuple
|
||
|
|
||
|
import numpy as np
|
||
|
from langchain_core.documents import Document
|
||
|
from langchain_core.embeddings import Embeddings
|
||
|
from langchain_core.vectorstores import VectorStore
|
||
|
|
||
|
from langchain_community.docstore.base import AddableMixin, Docstore
|
||
|
from langchain_community.docstore.in_memory import InMemoryDocstore
|
||
|
|
||
|
|
||
|
def dependable_usearch_import() -> Any:
    """Return the ``usearch.index`` module, importing it lazily.

    Returns:
        The imported ``usearch.index`` module.

    Raises:
        ImportError: If the ``usearch`` package cannot be imported; the
            message tells the user how to install it.
    """
    install_hint = (
        "Could not import usearch python package. "
        "Please install it with `pip install usearch` "
    )
    try:
        from usearch import index as usearch_index
    except ImportError:
        raise ImportError(install_hint)
    else:
        return usearch_index
|
||
|
|
||
|
|
||
|
class USearch(VectorStore):
    """`USearch` vector store.

    To use, you should have the ``usearch`` python package installed.

    Vectors live in a USearch index while the corresponding documents live
    in a docstore. Documents are stored under the string form of their id,
    and search results are resolved with ``str(key)`` of the keys returned
    by the index.
    """

    def __init__(
        self,
        embedding: Embeddings,
        index: Any,
        docstore: Docstore,
        ids: List[str],
    ):
        """Initialize with necessary components.

        Args:
            embedding: Embedding model used for both documents and queries.
            index: Index object; this class calls its ``add`` and ``search``
                methods.
            docstore: Store the documents are looked up in by string id.
            ids: Ids already known to the store. The last entry is used to
                derive the next auto-generated id in ``add_texts``.
        """
        self.embedding = embedding
        self.index = index
        self.docstore = docstore
        self.ids = ids

    @staticmethod
    def _build_documents(
        texts: List[str], metadatas: Optional[List[Dict]]
    ) -> List[Document]:
        """Pair each text with its metadata (``{}`` when none are given)."""
        return [
            Document(page_content=text, metadata=metadatas[i] if metadatas else {})
            for i, text in enumerate(texts)
        ]

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts: Iterable of strings to add to the vectorstore. It is
                consumed exactly once, so one-shot iterators are supported.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional array (or list) of unique IDs, one per text. When
                omitted, consecutive ids continuing after the last stored id
                are generated (starting at 0 for an empty store).
                NOTE(review): ids presumably need to be integer-like to be
                usable as index keys - confirm against the usearch version
                in use.

        Returns:
            List of ids (as strings) from adding the texts into the
            vectorstore. Empty when ``texts`` is empty.

        Raises:
            ValueError: If the underlying docstore does not support adding
                items.
        """
        if not isinstance(self.docstore, AddableMixin):
            raise ValueError(
                "If trying to add texts, the underlying docstore should support "
                f"adding items, which {self.docstore} does not"
            )

        # Materialize once: ``texts`` may be a generator, and it is needed
        # both for embedding and for building the documents.
        text_list = list(texts)
        if not text_list:
            # Nothing to embed or index.
            return []

        embeddings = self.embedding.embed_documents(text_list)
        documents = self._build_documents(text_list, metadatas)

        if ids is None:
            # Only derive the next id when we actually have to generate ids;
            # an empty store starts counting at 0.
            next_id = int(self.ids[-1]) + 1 if self.ids else 0
            ids = np.array([str(next_id + offset) for offset in range(len(text_list))])
        else:
            # Accept plain lists as well as arrays.
            ids = np.asarray(ids)

        # Searches resolve documents via ``str(key)``, so the docstore must be
        # keyed by plain strings regardless of the dtype of ``ids``.
        doc_keys = [str(key) for key in ids]

        self.index.add(ids, np.array(embeddings))
        self.docstore.add(dict(zip(doc_keys, documents)))
        self.ids.extend(doc_keys)
        return doc_keys

    def similarity_search_with_score(
        self,
        query: str,
        k: int = 4,
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query with distance.

        Raises:
            ValueError: If the docstore has no document for a key returned
                by the index.
        """
        query_embedding = self.embedding.embed_query(query)
        matches = self.index.search(np.array(query_embedding), k)

        docs_with_scores: List[Tuple[Document, float]] = []
        for key, score in zip(matches.keys, matches.distances):
            # The docstore is keyed by the string form of the index key.
            doc = self.docstore.search(str(key))
            if not isinstance(doc, Document):
                raise ValueError(f"Could not find document for id {key}, got {doc}")
            docs_with_scores.append((doc, score))

        return docs_with_scores

    def similarity_search(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.

        Raises:
            ValueError: If the docstore has no document for a key returned
                by the index.
        """
        # Same lookup as the scored variant; just drop the distances.
        return [doc for doc, _ in self.similarity_search_with_score(query, k)]

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[Dict]] = None,
        ids: Optional[np.ndarray] = None,
        metric: str = "cos",
        **kwargs: Any,
    ) -> USearch:
        """Construct USearch wrapper from raw documents.
        This is a user friendly interface that:
        1. Embeds documents.
        2. Creates an in memory docstore
        3. Initializes the USearch database
        This is intended to be a quick way to get started.

        Args:
            texts: Texts to embed and index. Must be non-empty, because the
                index dimensionality is taken from the first embedding.
            embedding: Embedding model used for documents and later queries.
            metadatas: Optional list of metadatas associated with the texts.
            ids: Optional array (or list) of unique IDs, one per text.
                Defaults to ``"0"``, ``"1"``, ... in text order.
            metric: Metric name passed to ``usearch.index.Index``.
                Defaults to ``"cos"``.

        Returns:
            A ``USearch`` store containing the given texts.

        Example:
            .. code-block:: python

                from langchain_community.vectorstores import USearch
                from langchain_community.embeddings import OpenAIEmbeddings

                embeddings = OpenAIEmbeddings()
                usearch = USearch.from_texts(texts, embeddings)
        """
        text_list = list(texts)
        embeddings = embedding.embed_documents(text_list)

        if ids is None:
            ids = np.array([str(position) for position in range(len(text_list))])
        else:
            # Accept plain lists as well as arrays.
            ids = np.asarray(ids)
        # Searches resolve documents via ``str(key)``, so key the docstore by
        # plain strings regardless of the dtype of ``ids``.
        doc_keys = [str(key) for key in ids]

        documents = cls._build_documents(text_list, metadatas)
        docstore = InMemoryDocstore(dict(zip(doc_keys, documents)))

        usearch = dependable_usearch_import()
        # NOTE: raises IndexError when ``texts`` is empty, since there is no
        # first embedding to read the dimensionality from.
        index = usearch.Index(ndim=len(embeddings[0]), metric=metric)
        index.add(ids, np.array(embeddings))
        return cls(embedding, index, docstore, doc_keys)
|