mirror of
https://github.com/hwchase17/langchain
synced 2024-11-04 06:00:26 +00:00
ed58eeb9c5
Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories 
community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes
204 lines
6.8 KiB
Python
204 lines
6.8 KiB
Python
from abc import ABC
|
|
from typing import TYPE_CHECKING, Any, Iterable, List, Optional, Tuple, Type
|
|
|
|
import numpy as np
|
|
from langchain_core.documents import Document
|
|
from langchain_core.embeddings import Embeddings
|
|
from langchain_core.pydantic_v1 import Field
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
|
|
|
if TYPE_CHECKING:
|
|
from docarray import BaseDoc
|
|
from docarray.index.abstract import BaseDocIndex
|
|
|
|
|
|
def _check_docarray_import() -> None:
    """Verify that a supported version of ``docarray`` is installed.

    Raises:
        ImportError: If ``docarray`` cannot be imported, or if the installed
            version is 0.31.x or older (>=0.32.0 is required).
    """
    try:
        import docarray
    except ImportError as err:
        raise ImportError(
            "Could not import docarray python package. "
            'Please install it with `pip install "langchain[docarray]"`.'
        ) from err

    # The version check deliberately lives outside the ``try`` block: raising
    # ImportError inside it would be caught by the handler above and replaced
    # with the misleading "could not import" message.
    da_version = docarray.__version__.split(".")
    if int(da_version[0]) == 0 and int(da_version[1]) <= 31:
        raise ImportError(
            "To use the DocArrayHnswSearch VectorStore the docarray "
            f"version >=0.32.0 is expected, received: {docarray.__version__}. "
            "To upgrade, please run: `pip install -U docarray`."
        )
|
|
|
|
|
|
class DocArrayIndex(VectorStore, ABC):
    """Base class for `DocArray` based vector stores."""

    def __init__(
        self,
        doc_index: "BaseDocIndex",
        embedding: Embeddings,
    ):
        """Initialize a vector store from DocArray's DocIndex.

        Args:
            doc_index: DocArray document index used to store and search docs.
            embedding: Embeddings object used to embed texts and queries.
        """
        self.doc_index = doc_index
        self.embedding = embedding

    @staticmethod
    def _get_doc_cls(**embeddings_params: Any) -> Type["BaseDoc"]:
        """Get docarray Document class describing the schema of DocIndex.

        Args:
            **embeddings_params: Keyword arguments forwarded to ``Field`` for
                the ``embedding`` attribute of the generated class.

        Returns:
            A ``BaseDoc`` subclass with ``text``, ``embedding`` and
            ``metadata`` fields.
        """
        # Imported lazily so the module can be imported without docarray.
        from docarray import BaseDoc
        from docarray.typing import NdArray

        class DocArrayDoc(BaseDoc):
            text: Optional[str]
            embedding: Optional[NdArray] = Field(**embeddings_params)
            metadata: Optional[dict]

        return DocArrayDoc

    @property
    def doc_cls(self) -> Type["BaseDoc"]:
        """Document class (schema) of the underlying doc index.

        Raises:
            ValueError: If the doc index has no schema set.
        """
        if self.doc_index._schema is None:
            raise ValueError("doc_index expected to have non-null _schema attribute.")
        return self.doc_index._schema

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Embed texts and add to the vector store.

        Args:
            texts: Iterable of strings to add to the vectorstore.
            metadatas: Optional list of metadatas associated with the texts.

        Returns:
            List of ids from adding the texts into the vectorstore.
        """
        # Materialize once: ``texts`` may be a one-shot iterator (e.g. a
        # generator), which would otherwise be exhausted by the embedding
        # call and yield nothing in the loop below.
        texts = list(texts)
        embeddings = self.embedding.embed_documents(texts)

        ids: List[str] = []
        for i, (t, e) in enumerate(zip(texts, embeddings)):
            m = metadatas[i] if metadatas else {}
            doc = self.doc_cls(text=t, embedding=e, metadata=m)
            self.doc_index.index([doc])
            ids.append(str(doc.id))

        return ids

    def similarity_search_with_score(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Tuple[Document, float]]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of documents most similar to the query text and
            cosine distance in float for each.
            Lower score represents more similarity.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore
        docs, scores = self.doc_index.find(query_doc, search_field="embedding", limit=k)

        result = [
            (Document(page_content=doc.text, metadata=doc.metadata), score)
            for doc, score in zip(docs, scores)
        ]
        return result

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query.
        """
        results = self.similarity_search_with_score(query, k=k, **kwargs)
        return [doc for doc, _ in results]

    def _similarity_search_with_relevance_scores(
        self,
        query: str,
        k: int = 4,
        **kwargs: Any,
    ) -> List[Tuple[Document, float]]:
        """Return docs and relevance scores, normalized on a scale from 0 to 1.

        0 is dissimilar, 1 is most similar.

        Raises:
            NotImplementedError: Always; not implemented in this base class.
        """
        raise NotImplementedError()

    def similarity_search_by_vector(
        self, embedding: List[float], k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to embedding vector.

        Args:
            embedding: Embedding to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.

        Returns:
            List of Documents most similar to the query vector.
        """

        query_doc = self.doc_cls(embedding=embedding)  # type: ignore
        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=k
        ).documents

        result = [
            Document(page_content=doc.text, metadata=doc.metadata) for doc in docs
        ]
        return result

    def max_marginal_relevance_search(
        self,
        query: str,
        k: int = 4,
        fetch_k: int = 20,
        lambda_mult: float = 0.5,
        **kwargs: Any,
    ) -> List[Document]:
        """Return docs selected using the maximal marginal relevance.

        Maximal marginal relevance optimizes for similarity to query AND diversity
        among selected documents.

        Args:
            query: Text to look up documents similar to.
            k: Number of Documents to return. Defaults to 4.
            fetch_k: Number of Documents to fetch to pass to MMR algorithm.
            lambda_mult: Number between 0 and 1 that determines the degree
                        of diversity among the results with 0 corresponding
                        to maximum diversity and 1 to minimum diversity.
                        Defaults to 0.5.
        Returns:
            List of Documents selected by maximal marginal relevance.
        """
        query_embedding = self.embedding.embed_query(query)
        query_doc = self.doc_cls(embedding=query_embedding)  # type: ignore

        docs = self.doc_index.find(
            query_doc, search_field="embedding", limit=fetch_k
        ).documents

        # Forward ``lambda_mult`` so the caller's diversity setting is
        # honoured rather than silently falling back to the helper's default.
        mmr_selected = maximal_marginal_relevance(
            np.array(query_embedding),
            docs.embedding,
            lambda_mult=lambda_mult,
            k=k,
        )
        results = [
            Document(page_content=docs[idx].text, metadata=docs[idx].metadata)
            for idx in mmr_selected
        ]
        return results
|