langchain/libs/community/langchain_community/vectorstores/docarray/hnsw.py
Erick Friis 781aee0068
community, langchain, infra: revert store extended test deps outside of poetry (#19153)
Reverts langchain-ai/langchain#18995

Because it makes installing dependencies in python 3.11 extended testing
take 80 minutes
2024-03-15 17:10:47 +00:00

110 lines
3.9 KiB
Python

from __future__ import annotations
from typing import Any, List, Literal, Optional
from langchain_core.embeddings import Embeddings
from langchain_community.vectorstores.docarray.base import (
DocArrayIndex,
_check_docarray_import,
)
class DocArrayHnswSearch(DocArrayIndex):
"""`HnswLib` storage using `DocArray` package.
To use it, you should have the ``docarray`` package with version >=0.32.0 installed.
You can install it with `pip install "langchain[docarray]"`.
"""
@classmethod
def from_params(
cls,
embedding: Embeddings,
work_dir: str,
n_dim: int,
dist_metric: Literal["cosine", "ip", "l2"] = "cosine",
max_elements: int = 1024,
index: bool = True,
ef_construction: int = 200,
ef: int = 10,
M: int = 16,
allow_replace_deleted: bool = True,
num_threads: int = 1,
**kwargs: Any,
) -> DocArrayHnswSearch:
"""Initialize DocArrayHnswSearch store.
Args:
embedding (Embeddings): Embedding function.
work_dir (str): path to the location where all the data will be stored.
n_dim (int): dimension of an embedding.
dist_metric (str): Distance metric for DocArrayHnswSearch can be one of:
"cosine", "ip", and "l2". Defaults to "cosine".
max_elements (int): Maximum number of vectors that can be stored.
Defaults to 1024.
index (bool): Whether an index should be built for this field.
Defaults to True.
ef_construction (int): defines a construction time/accuracy trade-off.
Defaults to 200.
ef (int): parameter controlling query time/accuracy trade-off.
Defaults to 10.
M (int): parameter that defines the maximum number of outgoing
connections in the graph. Defaults to 16.
allow_replace_deleted (bool): Enables replacing of deleted elements
with new added ones. Defaults to True.
num_threads (int): Sets the number of cpu threads to use. Defaults to 1.
**kwargs: Other keyword arguments to be passed to the get_doc_cls method.
"""
_check_docarray_import()
from docarray.index import HnswDocumentIndex
doc_cls = cls._get_doc_cls(
dim=n_dim,
space=dist_metric,
max_elements=max_elements,
index=index,
ef_construction=ef_construction,
ef=ef,
M=M,
allow_replace_deleted=allow_replace_deleted,
num_threads=num_threads,
**kwargs,
)
doc_index = HnswDocumentIndex[doc_cls](work_dir=work_dir) # type: ignore
return cls(doc_index, embedding)
@classmethod
def from_texts(
cls,
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
work_dir: Optional[str] = None,
n_dim: Optional[int] = None,
**kwargs: Any,
) -> DocArrayHnswSearch:
"""Create an DocArrayHnswSearch store and insert data.
Args:
texts (List[str]): Text data.
embedding (Embeddings): Embedding function.
metadatas (Optional[List[dict]]): Metadata for each text if it exists.
Defaults to None.
work_dir (str): path to the location where all the data will be stored.
n_dim (int): dimension of an embedding.
**kwargs: Other keyword arguments to be passed to the __init__ method.
Returns:
DocArrayHnswSearch Vector Store
"""
if work_dir is None:
raise ValueError("`work_dir` parameter has not been set.")
if n_dim is None:
raise ValueError("`n_dim` parameter has not been set.")
store = cls.from_params(embedding, work_dir, n_dim, **kwargs)
store.add_texts(texts=texts, metadatas=metadatas)
return store