mirror of
https://github.com/hwchase17/langchain
synced 2024-11-11 19:11:02 +00:00
50abeb7ed9
### Description Fixed a small bug in chroma.py add_images(), previously whenever we are not passing metadata the documents is containing the base64 of the uris passed, but when we are passing the metadata the documents is containing normal string uris which should not be the case. ### Issue In add_images() method when we are calling upsert() we have to use "b64_texts" instead of normal string "uris". ### Twitter handle https://twitter.com/whitepegasus01
798 lines
30 KiB
Python
798 lines
30 KiB
Python
from __future__ import annotations
|
|
|
|
import base64
|
|
import logging
|
|
import uuid
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Any,
|
|
Callable,
|
|
Dict,
|
|
Iterable,
|
|
List,
|
|
Optional,
|
|
Tuple,
|
|
Type,
|
|
)
|
|
|
|
import numpy as np
|
|
from langchain_core.documents import Document
|
|
from langchain_core.embeddings import Embeddings
|
|
from langchain_core.utils import xor_args
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
from langchain_community.vectorstores.utils import maximal_marginal_relevance
|
|
|
|
if TYPE_CHECKING:
|
|
import chromadb
|
|
import chromadb.config
|
|
from chromadb.api.types import ID, OneOrMany, Where, WhereDocument
|
|
|
|
logger = logging.getLogger()
|
|
DEFAULT_K = 4 # Number of Documents to return.
|
|
|
|
|
|
def _results_to_docs(results: Any) -> List[Document]:
|
|
return [doc for doc, _ in _results_to_docs_and_scores(results)]
|
|
|
|
|
|
def _results_to_docs_and_scores(results: Any) -> List[Tuple[Document, float]]:
|
|
return [
|
|
# TODO: Chroma can do batch querying,
|
|
# we shouldn't hard code to the 1st result
|
|
(Document(page_content=result[0], metadata=result[1] or {}), result[2])
|
|
for result in zip(
|
|
results["documents"][0],
|
|
results["metadatas"][0],
|
|
results["distances"][0],
|
|
)
|
|
]
|
|
|
|
|
|
class Chroma(VectorStore):
|
|
"""`ChromaDB` vector store.
|
|
|
|
To use, you should have the ``chromadb`` python package installed.
|
|
|
|
Example:
|
|
.. code-block:: python
|
|
|
|
from langchain_community.vectorstores import Chroma
|
|
from langchain_community.embeddings.openai import OpenAIEmbeddings
|
|
|
|
embeddings = OpenAIEmbeddings()
|
|
vectorstore = Chroma("langchain_store", embeddings)
|
|
"""
|
|
|
|
_LANGCHAIN_DEFAULT_COLLECTION_NAME = "langchain"
|
|
|
|
def __init__(
|
|
self,
|
|
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
|
|
embedding_function: Optional[Embeddings] = None,
|
|
persist_directory: Optional[str] = None,
|
|
client_settings: Optional[chromadb.config.Settings] = None,
|
|
collection_metadata: Optional[Dict] = None,
|
|
client: Optional[chromadb.Client] = None,
|
|
relevance_score_fn: Optional[Callable[[float], float]] = None,
|
|
) -> None:
|
|
"""Initialize with a Chroma client."""
|
|
try:
|
|
import chromadb
|
|
import chromadb.config
|
|
except ImportError:
|
|
raise ImportError(
|
|
"Could not import chromadb python package. "
|
|
"Please install it with `pip install chromadb`."
|
|
)
|
|
|
|
if client is not None:
|
|
self._client_settings = client_settings
|
|
self._client = client
|
|
self._persist_directory = persist_directory
|
|
else:
|
|
if client_settings:
|
|
# If client_settings is provided with persist_directory specified,
|
|
# then it is "in-memory and persisting to disk" mode.
|
|
client_settings.persist_directory = (
|
|
persist_directory or client_settings.persist_directory
|
|
)
|
|
if client_settings.persist_directory is not None:
|
|
# Maintain backwards compatibility with chromadb < 0.4.0
|
|
major, minor, _ = chromadb.__version__.split(".")
|
|
if int(major) == 0 and int(minor) < 4:
|
|
client_settings.chroma_db_impl = "duckdb+parquet"
|
|
|
|
_client_settings = client_settings
|
|
elif persist_directory:
|
|
# Maintain backwards compatibility with chromadb < 0.4.0
|
|
major, minor, _ = chromadb.__version__.split(".")
|
|
if int(major) == 0 and int(minor) < 4:
|
|
_client_settings = chromadb.config.Settings(
|
|
chroma_db_impl="duckdb+parquet",
|
|
)
|
|
else:
|
|
_client_settings = chromadb.config.Settings(is_persistent=True)
|
|
_client_settings.persist_directory = persist_directory
|
|
else:
|
|
_client_settings = chromadb.config.Settings()
|
|
self._client_settings = _client_settings
|
|
self._client = chromadb.Client(_client_settings)
|
|
self._persist_directory = (
|
|
_client_settings.persist_directory or persist_directory
|
|
)
|
|
|
|
self._embedding_function = embedding_function
|
|
self._collection = self._client.get_or_create_collection(
|
|
name=collection_name,
|
|
embedding_function=None,
|
|
metadata=collection_metadata,
|
|
)
|
|
self.override_relevance_score_fn = relevance_score_fn
|
|
|
|
@property
|
|
def embeddings(self) -> Optional[Embeddings]:
|
|
return self._embedding_function
|
|
|
|
@xor_args(("query_texts", "query_embeddings"))
|
|
def __query_collection(
|
|
self,
|
|
query_texts: Optional[List[str]] = None,
|
|
query_embeddings: Optional[List[List[float]]] = None,
|
|
n_results: int = 4,
|
|
where: Optional[Dict[str, str]] = None,
|
|
where_document: Optional[Dict[str, str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Document]:
|
|
"""Query the chroma collection."""
|
|
try:
|
|
import chromadb # noqa: F401
|
|
except ImportError:
|
|
raise ValueError(
|
|
"Could not import chromadb python package. "
|
|
"Please install it with `pip install chromadb`."
|
|
)
|
|
return self._collection.query(
|
|
query_texts=query_texts,
|
|
query_embeddings=query_embeddings,
|
|
n_results=n_results,
|
|
where=where,
|
|
where_document=where_document,
|
|
**kwargs,
|
|
)
|
|
|
|
def encode_image(self, uri: str) -> str:
|
|
"""Get base64 string from image URI."""
|
|
with open(uri, "rb") as image_file:
|
|
return base64.b64encode(image_file.read()).decode("utf-8")
|
|
|
|
def add_images(
|
|
self,
|
|
uris: List[str],
|
|
metadatas: Optional[List[dict]] = None,
|
|
ids: Optional[List[str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[str]:
|
|
"""Run more images through the embeddings and add to the vectorstore.
|
|
|
|
Args:
|
|
uris List[str]: File path to the image.
|
|
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
|
|
ids (Optional[List[str]], optional): Optional list of IDs.
|
|
|
|
Returns:
|
|
List[str]: List of IDs of the added images.
|
|
"""
|
|
# Map from uris to b64 encoded strings
|
|
b64_texts = [self.encode_image(uri=uri) for uri in uris]
|
|
# Populate IDs
|
|
if ids is None:
|
|
ids = [str(uuid.uuid1()) for _ in uris]
|
|
embeddings = None
|
|
# Set embeddings
|
|
if self._embedding_function is not None and hasattr(
|
|
self._embedding_function, "embed_image"
|
|
):
|
|
embeddings = self._embedding_function.embed_image(uris=uris)
|
|
if metadatas:
|
|
# fill metadatas with empty dicts if somebody
|
|
# did not specify metadata for all images
|
|
length_diff = len(uris) - len(metadatas)
|
|
if length_diff:
|
|
metadatas = metadatas + [{}] * length_diff
|
|
empty_ids = []
|
|
non_empty_ids = []
|
|
for idx, m in enumerate(metadatas):
|
|
if m:
|
|
non_empty_ids.append(idx)
|
|
else:
|
|
empty_ids.append(idx)
|
|
if non_empty_ids:
|
|
metadatas = [metadatas[idx] for idx in non_empty_ids]
|
|
images_with_metadatas = [b64_texts[idx] for idx in non_empty_ids]
|
|
embeddings_with_metadatas = (
|
|
[embeddings[idx] for idx in non_empty_ids] if embeddings else None
|
|
)
|
|
ids_with_metadata = [ids[idx] for idx in non_empty_ids]
|
|
try:
|
|
self._collection.upsert(
|
|
metadatas=metadatas,
|
|
embeddings=embeddings_with_metadatas,
|
|
documents=images_with_metadatas,
|
|
ids=ids_with_metadata,
|
|
)
|
|
except ValueError as e:
|
|
if "Expected metadata value to be" in str(e):
|
|
msg = (
|
|
"Try filtering complex metadata using "
|
|
"langchain_community.vectorstores.utils.filter_complex_metadata."
|
|
)
|
|
raise ValueError(e.args[0] + "\n\n" + msg)
|
|
else:
|
|
raise e
|
|
if empty_ids:
|
|
images_without_metadatas = [b64_texts[j] for j in empty_ids]
|
|
embeddings_without_metadatas = (
|
|
[embeddings[j] for j in empty_ids] if embeddings else None
|
|
)
|
|
ids_without_metadatas = [ids[j] for j in empty_ids]
|
|
self._collection.upsert(
|
|
embeddings=embeddings_without_metadatas,
|
|
documents=images_without_metadatas,
|
|
ids=ids_without_metadatas,
|
|
)
|
|
else:
|
|
self._collection.upsert(
|
|
embeddings=embeddings,
|
|
documents=b64_texts,
|
|
ids=ids,
|
|
)
|
|
return ids
|
|
|
|
def add_texts(
|
|
self,
|
|
texts: Iterable[str],
|
|
metadatas: Optional[List[dict]] = None,
|
|
ids: Optional[List[str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[str]:
|
|
"""Run more texts through the embeddings and add to the vectorstore.
|
|
|
|
Args:
|
|
texts (Iterable[str]): Texts to add to the vectorstore.
|
|
metadatas (Optional[List[dict]], optional): Optional list of metadatas.
|
|
ids (Optional[List[str]], optional): Optional list of IDs.
|
|
|
|
Returns:
|
|
List[str]: List of IDs of the added texts.
|
|
"""
|
|
# TODO: Handle the case where the user doesn't provide ids on the Collection
|
|
if ids is None:
|
|
ids = [str(uuid.uuid1()) for _ in texts]
|
|
embeddings = None
|
|
texts = list(texts)
|
|
if self._embedding_function is not None:
|
|
embeddings = self._embedding_function.embed_documents(texts)
|
|
if metadatas:
|
|
# fill metadatas with empty dicts if somebody
|
|
# did not specify metadata for all texts
|
|
length_diff = len(texts) - len(metadatas)
|
|
if length_diff:
|
|
metadatas = metadatas + [{}] * length_diff
|
|
empty_ids = []
|
|
non_empty_ids = []
|
|
for idx, m in enumerate(metadatas):
|
|
if m:
|
|
non_empty_ids.append(idx)
|
|
else:
|
|
empty_ids.append(idx)
|
|
if non_empty_ids:
|
|
metadatas = [metadatas[idx] for idx in non_empty_ids]
|
|
texts_with_metadatas = [texts[idx] for idx in non_empty_ids]
|
|
embeddings_with_metadatas = (
|
|
[embeddings[idx] for idx in non_empty_ids] if embeddings else None
|
|
)
|
|
ids_with_metadata = [ids[idx] for idx in non_empty_ids]
|
|
try:
|
|
self._collection.upsert(
|
|
metadatas=metadatas,
|
|
embeddings=embeddings_with_metadatas,
|
|
documents=texts_with_metadatas,
|
|
ids=ids_with_metadata,
|
|
)
|
|
except ValueError as e:
|
|
if "Expected metadata value to be" in str(e):
|
|
msg = (
|
|
"Try filtering complex metadata from the document using "
|
|
"langchain_community.vectorstores.utils.filter_complex_metadata."
|
|
)
|
|
raise ValueError(e.args[0] + "\n\n" + msg)
|
|
else:
|
|
raise e
|
|
if empty_ids:
|
|
texts_without_metadatas = [texts[j] for j in empty_ids]
|
|
embeddings_without_metadatas = (
|
|
[embeddings[j] for j in empty_ids] if embeddings else None
|
|
)
|
|
ids_without_metadatas = [ids[j] for j in empty_ids]
|
|
self._collection.upsert(
|
|
embeddings=embeddings_without_metadatas,
|
|
documents=texts_without_metadatas,
|
|
ids=ids_without_metadatas,
|
|
)
|
|
else:
|
|
self._collection.upsert(
|
|
embeddings=embeddings,
|
|
documents=texts,
|
|
ids=ids,
|
|
)
|
|
return ids
|
|
|
|
def similarity_search(
|
|
self,
|
|
query: str,
|
|
k: int = DEFAULT_K,
|
|
filter: Optional[Dict[str, str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Document]:
|
|
"""Run similarity search with Chroma.
|
|
|
|
Args:
|
|
query (str): Query text to search for.
|
|
k (int): Number of results to return. Defaults to 4.
|
|
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
|
|
Returns:
|
|
List[Document]: List of documents most similar to the query text.
|
|
"""
|
|
docs_and_scores = self.similarity_search_with_score(
|
|
query, k, filter=filter, **kwargs
|
|
)
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
def similarity_search_by_vector(
|
|
self,
|
|
embedding: List[float],
|
|
k: int = DEFAULT_K,
|
|
filter: Optional[Dict[str, str]] = None,
|
|
where_document: Optional[Dict[str, str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Document]:
|
|
"""Return docs most similar to embedding vector.
|
|
Args:
|
|
embedding (List[float]): Embedding to look up documents similar to.
|
|
k (int): Number of Documents to return. Defaults to 4.
|
|
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
Returns:
|
|
List of Documents most similar to the query vector.
|
|
"""
|
|
results = self.__query_collection(
|
|
query_embeddings=embedding,
|
|
n_results=k,
|
|
where=filter,
|
|
where_document=where_document,
|
|
**kwargs,
|
|
)
|
|
return _results_to_docs(results)
|
|
|
|
def similarity_search_by_vector_with_relevance_scores(
|
|
self,
|
|
embedding: List[float],
|
|
k: int = DEFAULT_K,
|
|
filter: Optional[Dict[str, str]] = None,
|
|
where_document: Optional[Dict[str, str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Tuple[Document, float]]:
|
|
"""
|
|
Return docs most similar to embedding vector and similarity score.
|
|
|
|
Args:
|
|
embedding (List[float]): Embedding to look up documents similar to.
|
|
k (int): Number of Documents to return. Defaults to 4.
|
|
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
|
|
Returns:
|
|
List[Tuple[Document, float]]: List of documents most similar to
|
|
the query text and cosine distance in float for each.
|
|
Lower score represents more similarity.
|
|
"""
|
|
results = self.__query_collection(
|
|
query_embeddings=embedding,
|
|
n_results=k,
|
|
where=filter,
|
|
where_document=where_document,
|
|
**kwargs,
|
|
)
|
|
return _results_to_docs_and_scores(results)
|
|
|
|
def similarity_search_with_score(
|
|
self,
|
|
query: str,
|
|
k: int = DEFAULT_K,
|
|
filter: Optional[Dict[str, str]] = None,
|
|
where_document: Optional[Dict[str, str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Tuple[Document, float]]:
|
|
"""Run similarity search with Chroma with distance.
|
|
|
|
Args:
|
|
query (str): Query text to search for.
|
|
k (int): Number of results to return. Defaults to 4.
|
|
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
|
|
Returns:
|
|
List[Tuple[Document, float]]: List of documents most similar to
|
|
the query text and cosine distance in float for each.
|
|
Lower score represents more similarity.
|
|
"""
|
|
if self._embedding_function is None:
|
|
results = self.__query_collection(
|
|
query_texts=[query],
|
|
n_results=k,
|
|
where=filter,
|
|
where_document=where_document,
|
|
**kwargs,
|
|
)
|
|
else:
|
|
query_embedding = self._embedding_function.embed_query(query)
|
|
results = self.__query_collection(
|
|
query_embeddings=[query_embedding],
|
|
n_results=k,
|
|
where=filter,
|
|
where_document=where_document,
|
|
**kwargs,
|
|
)
|
|
|
|
return _results_to_docs_and_scores(results)
|
|
|
|
def _select_relevance_score_fn(self) -> Callable[[float], float]:
|
|
"""
|
|
The 'correct' relevance function
|
|
may differ depending on a few things, including:
|
|
- the distance / similarity metric used by the VectorStore
|
|
- the scale of your embeddings (OpenAI's are unit normed. Many others are not!)
|
|
- embedding dimensionality
|
|
- etc.
|
|
"""
|
|
if self.override_relevance_score_fn:
|
|
return self.override_relevance_score_fn
|
|
|
|
distance = "l2"
|
|
distance_key = "hnsw:space"
|
|
metadata = self._collection.metadata
|
|
|
|
if metadata and distance_key in metadata:
|
|
distance = metadata[distance_key]
|
|
|
|
if distance == "cosine":
|
|
return self._cosine_relevance_score_fn
|
|
elif distance == "l2":
|
|
return self._euclidean_relevance_score_fn
|
|
elif distance == "ip":
|
|
return self._max_inner_product_relevance_score_fn
|
|
else:
|
|
raise ValueError(
|
|
"No supported normalization function"
|
|
f" for distance metric of type: {distance}."
|
|
"Consider providing relevance_score_fn to Chroma constructor."
|
|
)
|
|
|
|
def max_marginal_relevance_search_by_vector(
|
|
self,
|
|
embedding: List[float],
|
|
k: int = DEFAULT_K,
|
|
fetch_k: int = 20,
|
|
lambda_mult: float = 0.5,
|
|
filter: Optional[Dict[str, str]] = None,
|
|
where_document: Optional[Dict[str, str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Document]:
|
|
"""Return docs selected using the maximal marginal relevance.
|
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
among selected documents.
|
|
|
|
Args:
|
|
embedding: Embedding to look up documents similar to.
|
|
k: Number of Documents to return. Defaults to 4.
|
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
of diversity among the results with 0 corresponding
|
|
to maximum diversity and 1 to minimum diversity.
|
|
Defaults to 0.5.
|
|
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
|
|
Returns:
|
|
List of Documents selected by maximal marginal relevance.
|
|
"""
|
|
|
|
results = self.__query_collection(
|
|
query_embeddings=embedding,
|
|
n_results=fetch_k,
|
|
where=filter,
|
|
where_document=where_document,
|
|
include=["metadatas", "documents", "distances", "embeddings"],
|
|
**kwargs,
|
|
)
|
|
mmr_selected = maximal_marginal_relevance(
|
|
np.array(embedding, dtype=np.float32),
|
|
results["embeddings"][0],
|
|
k=k,
|
|
lambda_mult=lambda_mult,
|
|
)
|
|
|
|
candidates = _results_to_docs(results)
|
|
|
|
selected_results = [r for i, r in enumerate(candidates) if i in mmr_selected]
|
|
return selected_results
|
|
|
|
def max_marginal_relevance_search(
|
|
self,
|
|
query: str,
|
|
k: int = DEFAULT_K,
|
|
fetch_k: int = 20,
|
|
lambda_mult: float = 0.5,
|
|
filter: Optional[Dict[str, str]] = None,
|
|
where_document: Optional[Dict[str, str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Document]:
|
|
"""Return docs selected using the maximal marginal relevance.
|
|
Maximal marginal relevance optimizes for similarity to query AND diversity
|
|
among selected documents.
|
|
|
|
Args:
|
|
query: Text to look up documents similar to.
|
|
k: Number of Documents to return. Defaults to 4.
|
|
fetch_k: Number of Documents to fetch to pass to MMR algorithm.
|
|
lambda_mult: Number between 0 and 1 that determines the degree
|
|
of diversity among the results with 0 corresponding
|
|
to maximum diversity and 1 to minimum diversity.
|
|
Defaults to 0.5.
|
|
filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
|
|
|
|
Returns:
|
|
List of Documents selected by maximal marginal relevance.
|
|
"""
|
|
if self._embedding_function is None:
|
|
raise ValueError(
|
|
"For MMR search, you must specify an embedding function on" "creation."
|
|
)
|
|
|
|
embedding = self._embedding_function.embed_query(query)
|
|
docs = self.max_marginal_relevance_search_by_vector(
|
|
embedding,
|
|
k,
|
|
fetch_k,
|
|
lambda_mult=lambda_mult,
|
|
filter=filter,
|
|
where_document=where_document,
|
|
)
|
|
return docs
|
|
|
|
def delete_collection(self) -> None:
|
|
"""Delete the collection."""
|
|
self._client.delete_collection(self._collection.name)
|
|
|
|
def get(
|
|
self,
|
|
ids: Optional[OneOrMany[ID]] = None,
|
|
where: Optional[Where] = None,
|
|
limit: Optional[int] = None,
|
|
offset: Optional[int] = None,
|
|
where_document: Optional[WhereDocument] = None,
|
|
include: Optional[List[str]] = None,
|
|
) -> Dict[str, Any]:
|
|
"""Gets the collection.
|
|
|
|
Args:
|
|
ids: The ids of the embeddings to get. Optional.
|
|
where: A Where type dict used to filter results by.
|
|
E.g. `{"color" : "red", "price": 4.20}`. Optional.
|
|
limit: The number of documents to return. Optional.
|
|
offset: The offset to start returning results from.
|
|
Useful for paging results with limit. Optional.
|
|
where_document: A WhereDocument type dict used to filter by the documents.
|
|
E.g. `{$contains: "hello"}`. Optional.
|
|
include: A list of what to include in the results.
|
|
Can contain `"embeddings"`, `"metadatas"`, `"documents"`.
|
|
Ids are always included.
|
|
Defaults to `["metadatas", "documents"]`. Optional.
|
|
"""
|
|
kwargs = {
|
|
"ids": ids,
|
|
"where": where,
|
|
"limit": limit,
|
|
"offset": offset,
|
|
"where_document": where_document,
|
|
}
|
|
|
|
if include is not None:
|
|
kwargs["include"] = include
|
|
|
|
return self._collection.get(**kwargs)
|
|
|
|
def persist(self) -> None:
|
|
"""Persist the collection.
|
|
|
|
This can be used to explicitly persist the data to disk.
|
|
It will also be called automatically when the object is destroyed.
|
|
"""
|
|
if self._persist_directory is None:
|
|
raise ValueError(
|
|
"You must specify a persist_directory on"
|
|
"creation to persist the collection."
|
|
)
|
|
import chromadb
|
|
|
|
# Maintain backwards compatibility with chromadb < 0.4.0
|
|
major, minor, _ = chromadb.__version__.split(".")
|
|
if int(major) == 0 and int(minor) < 4:
|
|
self._client.persist()
|
|
|
|
def update_document(self, document_id: str, document: Document) -> None:
|
|
"""Update a document in the collection.
|
|
|
|
Args:
|
|
document_id (str): ID of the document to update.
|
|
document (Document): Document to update.
|
|
"""
|
|
return self.update_documents([document_id], [document])
|
|
|
|
def update_documents(self, ids: List[str], documents: List[Document]) -> None:
|
|
"""Update a document in the collection.
|
|
|
|
Args:
|
|
ids (List[str]): List of ids of the document to update.
|
|
documents (List[Document]): List of documents to update.
|
|
"""
|
|
text = [document.page_content for document in documents]
|
|
metadata = [document.metadata for document in documents]
|
|
if self._embedding_function is None:
|
|
raise ValueError(
|
|
"For update, you must specify an embedding function on creation."
|
|
)
|
|
embeddings = self._embedding_function.embed_documents(text)
|
|
|
|
if hasattr(
|
|
self._collection._client, "max_batch_size"
|
|
): # for Chroma 0.4.10 and above
|
|
from chromadb.utils.batch_utils import create_batches
|
|
|
|
for batch in create_batches(
|
|
api=self._collection._client,
|
|
ids=ids,
|
|
metadatas=metadata,
|
|
documents=text,
|
|
embeddings=embeddings,
|
|
):
|
|
self._collection.update(
|
|
ids=batch[0],
|
|
embeddings=batch[1],
|
|
documents=batch[3],
|
|
metadatas=batch[2],
|
|
)
|
|
else:
|
|
self._collection.update(
|
|
ids=ids,
|
|
embeddings=embeddings,
|
|
documents=text,
|
|
metadatas=metadata,
|
|
)
|
|
|
|
@classmethod
|
|
def from_texts(
|
|
cls: Type[Chroma],
|
|
texts: List[str],
|
|
embedding: Optional[Embeddings] = None,
|
|
metadatas: Optional[List[dict]] = None,
|
|
ids: Optional[List[str]] = None,
|
|
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
|
|
persist_directory: Optional[str] = None,
|
|
client_settings: Optional[chromadb.config.Settings] = None,
|
|
client: Optional[chromadb.Client] = None,
|
|
collection_metadata: Optional[Dict] = None,
|
|
**kwargs: Any,
|
|
) -> Chroma:
|
|
"""Create a Chroma vectorstore from a raw documents.
|
|
|
|
If a persist_directory is specified, the collection will be persisted there.
|
|
Otherwise, the data will be ephemeral in-memory.
|
|
|
|
Args:
|
|
texts (List[str]): List of texts to add to the collection.
|
|
collection_name (str): Name of the collection to create.
|
|
persist_directory (Optional[str]): Directory to persist the collection.
|
|
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
|
|
metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
|
|
ids (Optional[List[str]]): List of document IDs. Defaults to None.
|
|
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
|
|
collection_metadata (Optional[Dict]): Collection configurations.
|
|
Defaults to None.
|
|
|
|
Returns:
|
|
Chroma: Chroma vectorstore.
|
|
"""
|
|
chroma_collection = cls(
|
|
collection_name=collection_name,
|
|
embedding_function=embedding,
|
|
persist_directory=persist_directory,
|
|
client_settings=client_settings,
|
|
client=client,
|
|
collection_metadata=collection_metadata,
|
|
**kwargs,
|
|
)
|
|
if ids is None:
|
|
ids = [str(uuid.uuid1()) for _ in texts]
|
|
if hasattr(
|
|
chroma_collection._client, "max_batch_size"
|
|
): # for Chroma 0.4.10 and above
|
|
from chromadb.utils.batch_utils import create_batches
|
|
|
|
for batch in create_batches(
|
|
api=chroma_collection._client,
|
|
ids=ids,
|
|
metadatas=metadatas,
|
|
documents=texts,
|
|
):
|
|
chroma_collection.add_texts(
|
|
texts=batch[3] if batch[3] else [],
|
|
metadatas=batch[2] if batch[2] else None,
|
|
ids=batch[0],
|
|
)
|
|
else:
|
|
chroma_collection.add_texts(texts=texts, metadatas=metadatas, ids=ids)
|
|
return chroma_collection
|
|
|
|
@classmethod
|
|
def from_documents(
|
|
cls: Type[Chroma],
|
|
documents: List[Document],
|
|
embedding: Optional[Embeddings] = None,
|
|
ids: Optional[List[str]] = None,
|
|
collection_name: str = _LANGCHAIN_DEFAULT_COLLECTION_NAME,
|
|
persist_directory: Optional[str] = None,
|
|
client_settings: Optional[chromadb.config.Settings] = None,
|
|
client: Optional[chromadb.Client] = None, # Add this line
|
|
collection_metadata: Optional[Dict] = None,
|
|
**kwargs: Any,
|
|
) -> Chroma:
|
|
"""Create a Chroma vectorstore from a list of documents.
|
|
|
|
If a persist_directory is specified, the collection will be persisted there.
|
|
Otherwise, the data will be ephemeral in-memory.
|
|
|
|
Args:
|
|
collection_name (str): Name of the collection to create.
|
|
persist_directory (Optional[str]): Directory to persist the collection.
|
|
ids (Optional[List[str]]): List of document IDs. Defaults to None.
|
|
documents (List[Document]): List of documents to add to the vectorstore.
|
|
embedding (Optional[Embeddings]): Embedding function. Defaults to None.
|
|
client_settings (Optional[chromadb.config.Settings]): Chroma client settings
|
|
collection_metadata (Optional[Dict]): Collection configurations.
|
|
Defaults to None.
|
|
|
|
Returns:
|
|
Chroma: Chroma vectorstore.
|
|
"""
|
|
texts = [doc.page_content for doc in documents]
|
|
metadatas = [doc.metadata for doc in documents]
|
|
return cls.from_texts(
|
|
texts=texts,
|
|
embedding=embedding,
|
|
metadatas=metadatas,
|
|
ids=ids,
|
|
collection_name=collection_name,
|
|
persist_directory=persist_directory,
|
|
client_settings=client_settings,
|
|
client=client,
|
|
collection_metadata=collection_metadata,
|
|
**kwargs,
|
|
)
|
|
|
|
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> None:
|
|
"""Delete by vector IDs.
|
|
|
|
Args:
|
|
ids: List of ids to delete.
|
|
"""
|
|
self._collection.delete(ids=ids)
|