forked from Archives/langchain
parent aaad6cc954 · commit 166cda2cc6
@@ -0,0 +1,211 @@
"""Wrapper around Activeloop Deep Lake."""
from __future__ import annotations

import logging
import uuid
from typing import Any, Iterable, List, Optional, Sequence

import numpy as np

from langchain.docstore.document import Document
from langchain.embeddings.base import Embeddings
from langchain.vectorstores.base import VectorStore

logger = logging.getLogger()


def L2_search(
    query_embedding: np.ndarray, data_vectors: np.ndarray, k: int = 4
) -> list:
    """Naive L2 search for nearest neighbors."""
    # Calculate the L2 distance between the query_embedding and all data_vectors
    distances = np.linalg.norm(data_vectors - query_embedding, axis=1)

    # Sort the distances and return the indices of the k nearest vectors
    nearest_indices = np.argsort(distances)[:k]
    return nearest_indices.tolist()
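
# A minimal usage sketch of L2_search (illustrative only; the vectors below
# are made up). The query is compared against every stored row, and the
# indices of the k closest rows come back in ascending-distance order:
#
#     data = np.array([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0], [0.1, 0.0, 0.0]])
#     query = np.array([0.0, 0.0, 0.1])
#     L2_search(query, data, k=2)  # -> [0, 2]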


class DeepLake(VectorStore):
    """Wrapper around Deep Lake, a data lake for deep learning applications.

    It not only stores embeddings, but also the original data and queries with
    version control automatically enabled.

    It is more than just a vector store. You can use the dataset to fine-tune
    your own LLM models or use it for other downstream tasks.

    We implement naive similarity search, but it can be extended with Tensor
    Query Language (TQL) for production use cases over billions of rows.

    To use, you should have the ``deeplake`` python package installed.

    Example:
        .. code-block:: python

            from langchain.vectorstores import DeepLake
            from langchain.embeddings.openai import OpenAIEmbeddings

            embeddings = OpenAIEmbeddings()
            vectorstore = DeepLake("langchain_store", embedding_function=embeddings)
    """

    _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "mem://langchain"

    def __init__(
        self,
        dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
        token: Optional[str] = None,
        embedding_function: Optional[Embeddings] = None,
    ) -> None:
        """Initialize with Deep Lake client."""
        try:
            import deeplake
        except ImportError:
            raise ValueError(
                "Could not import deeplake python package. "
                "Please install it with `pip install deeplake`."
            )
        self._deeplake = deeplake

        if deeplake.exists(dataset_path, token=token):
            self.ds = deeplake.load(dataset_path, token=token)
            logger.warning(
                f"Deep Lake Dataset in {dataset_path} already exists, "
                f"loading it from storage"
            )
            self.ds.summary()
        else:
            self.ds = deeplake.empty(dataset_path, token=token, overwrite=True)
            with self.ds:
                self.ds.create_tensor("text", htype="text")
                self.ds.create_tensor("metadata", htype="json")
                self.ds.create_tensor("embedding", htype="generic")
                self.ds.create_tensor("ids", htype="text")

        self._embedding_function = embedding_function
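
    # Illustrative sketch (not a guarantee of on-disk layout) of the row
    # schema the constructor creates, one row per added text:
    #
    #   tensor      htype     example value
    #   ---------   -------   ------------------------------
    #   text        text      "foo"
    #   metadata    json      {"page": "0"}
    #   embedding   generic   [0.12, -0.03, ...]
    #   ids         text      "a2f1c0e4-..." (a generated UUID)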

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Run more texts through the embeddings and add to the vectorstore.

        Args:
            texts (Iterable[str]): Texts to add to the vectorstore.
            metadatas (Optional[List[dict]], optional): Optional list of metadatas.
            ids (Optional[List[str]], optional): Optional list of IDs.

        Returns:
            List[str]: List of IDs of the added texts.
        """
        # Materialize the iterable first so a generator is not consumed twice
        # (once for generating IDs and once for embedding).
        text_list = list(texts)

        if ids is None:
            ids = [str(uuid.uuid1()) for _ in text_list]

        if self._embedding_function is None:
            embeddings: Sequence[Optional[List[float]]] = [None] * len(text_list)
        else:
            embeddings = self._embedding_function.embed_documents(text_list)

        if metadatas is None:
            metadatas_to_use: Sequence[Optional[dict]] = [None] * len(text_list)
        else:
            metadatas_to_use = metadatas

        elements = zip(text_list, embeddings, metadatas_to_use, ids)

        @self._deeplake.compute
        def ingest(sample_in: list, sample_out: list) -> None:
            s = {
                "text": sample_in[0],
                "embedding": sample_in[1],
                "metadata": sample_in[2],
                "ids": sample_in[3],
            }
            sample_out.append(s)

        ingest().eval(list(elements), self.ds)
        self.ds.commit()

        return ids
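
    # Minimal usage sketch (illustrative; assumes a store constructed as in
    # the class docstring above):
    #
    #     ids = vectorstore.add_texts(
    #         ["hello", "world"], metadatas=[{"src": "a"}, {"src": "b"}]
    #     )
    #     # ids -> two generated UUID strings, one per text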

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Return docs most similar to query."""
        if self._embedding_function is None:
            # No embedding function: fall back to exact substring matching
            # over the stored texts.
            self.ds.summary()
            ds_view = self.ds.filter(lambda x: query in x["text"].data()["value"])
        else:
            query_emb = np.array(self._embedding_function.embed_query(query))
            embeddings = self.ds.embedding.numpy()
            indices = L2_search(query_emb, embeddings, k=k)
            ds_view = self.ds[indices]

        docs = [
            Document(
                page_content=el["text"].data()["value"],
                metadata=el["metadata"].data()["value"],
            )
            for el in ds_view
        ]
        # Cap the result at k; the substring-filter branch above does not
        # limit the number of matches on its own.
        return docs[:k]
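
    # Usage sketch (illustrative): with an embedding function configured,
    # the query is embedded, L2_search runs over every stored embedding,
    # and the k nearest rows come back wrapped as Documents:
    #
    #     docs = vectorstore.similarity_search("what is deep lake?", k=2)
    #     print(docs[0].page_content)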

    @classmethod
    def from_texts(
        cls,
        texts: List[str],
        embedding: Optional[Embeddings] = None,
        metadatas: Optional[List[dict]] = None,
        ids: Optional[List[str]] = None,
        dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
        **kwargs: Any,
    ) -> DeepLake:
        """Create a Deep Lake dataset from raw documents.

        If a persistent dataset_path is specified, the dataset will be
        persisted there. Otherwise, the data will be ephemeral in-memory.

        Args:
            dataset_path (str): The full path to the dataset. Can be:
                - a Deep Lake cloud path of the form ``hub://username/datasetname``.
                    To write to Deep Lake cloud datasets,
                    ensure that you are logged in to Deep Lake
                    (use 'activeloop login' from command line)
                - an s3 path of the form ``s3://bucketname/path/to/dataset``.
                    Credentials are required in either the environment or
                    passed to the creds argument.
                - a local file system path of the form ``./path/to/dataset`` or
                    ``~/path/to/dataset`` or ``path/to/dataset``.
                - a memory path of the form ``mem://path/to/dataset`` which doesn't
                    save the dataset but keeps it in memory instead.
                    Should be used only for testing as it does not persist.
            texts (List[str]): List of texts to add.
            embedding (Optional[Embeddings]): Embedding function. Defaults to None.
            metadatas (Optional[List[dict]]): List of metadatas. Defaults to None.
            ids (Optional[List[str]]): List of document IDs. Defaults to None.

        Returns:
            DeepLake: Deep Lake dataset.
        """
        deeplake_dataset = cls(
            dataset_path=dataset_path,
            embedding_function=embedding,
        )
        deeplake_dataset.add_texts(texts=texts, metadatas=metadatas, ids=ids)
        return deeplake_dataset
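
    # Usage sketch (illustrative; ``hub://user/my_dataset`` is a placeholder
    # path, not a real dataset):
    #
    #     store = DeepLake.from_texts(
    #         ["foo", "bar"],
    #         embedding=OpenAIEmbeddings(),
    #         dataset_path="hub://user/my_dataset",
    #     )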

    def delete_dataset(self) -> None:
        """Delete the dataset."""
        self.ds.delete()

    def persist(self) -> None:
        """Persist the dataset to its storage location."""
        self.ds.flush()

@@ -0,0 +1,58 @@
"""Test Deep Lake functionality."""
from langchain.docstore.document import Document
from langchain.vectorstores import DeepLake
from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings


def test_deeplake() -> None:
    """Test end to end construction and search."""
    texts = ["foo", "bar", "baz"]
    docsearch = DeepLake.from_texts(
        dataset_path="mem://test_path", texts=texts, embedding=FakeEmbeddings()
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]


def test_deeplake_with_metadatas() -> None:
    """Test end to end construction and search with metadata."""
    texts = ["foo", "bar", "baz"]
    metadatas = [{"page": str(i)} for i in range(len(texts))]
    docsearch = DeepLake.from_texts(
        dataset_path="mem://test_path",
        texts=texts,
        embedding=FakeEmbeddings(),
        metadatas=metadatas,
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo", metadata={"page": "0"})]


def test_deeplake_with_persistence() -> None:
    """Test end to end construction and search, with persistence."""
    dataset_path = "./tests/persist_dir"
    texts = ["foo", "bar", "baz"]
    docsearch = DeepLake.from_texts(
        dataset_path=dataset_path,
        texts=texts,
        embedding=FakeEmbeddings(),
    )

    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    docsearch.persist()

    # Get a new VectorStore from the persisted directory
    docsearch = DeepLake(
        dataset_path=dataset_path,
        embedding_function=FakeEmbeddings(),
    )
    output = docsearch.similarity_search("foo", k=1)
    assert output == [Document(page_content="foo")]

    # Clean up
    docsearch.delete_dataset()

    # Persist doesn't need to be called again
    # Data will be automatically persisted on object deletion
    # Or on program exit