Bagatur/from texts bug fix (#8394)

---------

Co-authored-by: Davit Buniatyan <davit@loqsh.com>
Co-authored-by: Davit Buniatyan <d@activeloop.ai>
Co-authored-by: adilkhan <adilkhan.sarsen@nu.edu.kz>
Co-authored-by: Ivo Stranic <istranic@gmail.com>
Bagatur authored 2023-07-27 21:52:38 -07:00; committed by GitHub
parent 1efb9bae5f
commit a1a650c743
3 changed files with 565 additions and 38 deletions

Changed file: DeepLake vector store implementation

@@ -56,6 +56,7 @@ class DeepLake(VectorStore):
         self,
         dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
         token: Optional[str] = None,
+        embedding: Optional[Embeddings] = None,
         embedding_function: Optional[Embeddings] = None,
         read_only: bool = False,
         ingestion_batch_size: int = 1000,
@@ -86,8 +87,11 @@ class DeepLake(VectorStore):
             token (str, optional): Activeloop token, for fetching credentials
                 to the dataset at path if it is a Deep Lake dataset.
                 Tokens are normally autogenerated. Optional.
-            embedding_function (str, optional): Function to convert
-                either documents or query. Optional.
+            embedding (Embeddings, optional): Function to convert
+                either documents or query. Optional.
+            embedding_function (Embeddings, optional): Function to convert
+                either documents or query. Optional. Deprecated: keeping this
+                parameter for backwards compatibility.
             read_only (bool): Open dataset in read-only mode. Default is False.
             ingestion_batch_size (int): During data ingestion, data is divided
                 into batches. Batch size is the size of each batch.
@@ -138,9 +142,14 @@ class DeepLake(VectorStore):
         self.dataset_path = dataset_path
 
+        if embedding_function:
+            logger.warning(
+                "Using embedding function is deprecated and will be removed "
+                "in the future. Please use embedding instead."
+            )
         self.vectorstore = DeepLakeVectorStore(
             path=self.dataset_path,
-            embedding_function=embedding_function,
+            embedding_function=embedding_function or embedding,
             read_only=read_only,
             token=token,
             exec_option=exec_option,
@@ -148,7 +157,7 @@ class DeepLake(VectorStore):
             **kwargs,
         )
 
-        self._embedding_function = embedding_function
+        self._embedding_function = embedding_function or embedding
         self._id_tensor_name = "ids" if "ids" in self.vectorstore.tensors() else "id"
 
     @property
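Taken together, the hunks above add a new `embedding` parameter, keep `embedding_function` as a deprecated alias that logs a warning, and forward whichever one is set to the underlying `DeepLakeVectorStore`. A minimal usage sketch of the two constructor styles follows; the dataset path and the `FakeEmbeddings` stand-in are illustrative, not part of this diff, and the `deeplake` package must be installed:

from langchain.embeddings.fake import FakeEmbeddings
from langchain.vectorstores import DeepLake

# Deterministic stand-in embeddings so the sketch runs without API keys.
embeddings = FakeEmbeddings(size=1536)

# New style: pass the embedding model via `embedding`.
db = DeepLake(dataset_path="./example_deeplake", embedding=embeddings)

# Legacy style: still accepted, but logs "Using embedding function is
# deprecated ..." and should be migrated to `embedding`.
db_legacy = DeepLake(dataset_path="./example_deeplake", embedding_function=embeddings)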
@@ -719,7 +728,6 @@ class DeepLake(VectorStore):
         metadatas: Optional[List[dict]] = None,
         ids: Optional[List[str]] = None,
         dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH,
-        embedding_function: Optional[Embeddings] = None,
         **kwargs: Any,
     ) -> DeepLake:
         """Create a Deep Lake dataset from a raw documents.
@@ -761,20 +769,8 @@ class DeepLake(VectorStore):
         Returns:
             DeepLake: Deep Lake dataset.
 
-        Raises:
-            ValueError: If 'embedding' is provided in kwargs. This is deprecated,
-                please use `embedding_function` instead.
         """
-        if embedding:
-            raise ValueError(
-                "using embedding as embedidng_functions is deprecated. "
-                "Please use `embedding_function` instead."
-            )
-
-        deeplake_dataset = cls(
-            dataset_path=dataset_path, embedding_function=embedding_function, **kwargs
-        )
+        deeplake_dataset = cls(dataset_path=dataset_path, embedding=embedding, **kwargs)
         deeplake_dataset.add_texts(
             texts=texts,
             metadatas=metadatas,
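This hunk is the fix the PR title refers to: `from_texts` previously rejected the `embedding` argument with a ValueError, even though `embedding` is the standard second parameter of `VectorStore.from_texts`; it now forwards `embedding` directly to the constructor. A hedged sketch of the call this change unblocks (the texts, dataset path, and fake embeddings are illustrative):

from langchain.embeddings.fake import FakeEmbeddings
from langchain.vectorstores import DeepLake

# Before this fix, passing `embedding` here raised the (misspelled)
# "using embedding as embedidng_functions is deprecated" error.
db = DeepLake.from_texts(
    texts=["foo", "bar", "baz"],
    embedding=FakeEmbeddings(size=1536),
    dataset_path="./example_deeplake",
)
print(db.similarity_search("foo", k=1))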

One file's diff is suppressed because it is too large.

Changed file: DeepLake vector store tests

@@ -137,10 +137,11 @@ def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) -> None:
     tql_query = (
         f"SELECT * WHERE "
         f"id=='{deeplake_datastore.vectorstore.dataset.id[0].numpy()[0]}'"
     )
-    output = deeplake_datastore.similarity_search(
-        query="foo", tql_query=tql_query, k=1, distance_metric=distance_metric
-    )
-    assert len(output) == 1
+    with pytest.raises(ValueError):
+        output = deeplake_datastore.similarity_search(
+            query="foo", tql_query=tql_query, k=1, distance_metric=distance_metric
+        )
 
     deeplake_datastore.delete_dataset()
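Note that the constructor's deprecation path signals through `logger.warning`, not the `warnings` module, so a companion test would capture it with pytest's `caplog` fixture rather than `pytest.warns`. A hypothetical sketch, not part of this diff (the test name, dataset path, and embeddings are invented for illustration):

import logging

import pytest
from langchain.embeddings.fake import FakeEmbeddings
from langchain.vectorstores import DeepLake


def test_embedding_function_is_deprecated(caplog: pytest.LogCaptureFixture) -> None:
    # Hypothetical test: the legacy `embedding_function` parameter should
    # log a deprecation warning pointing callers at `embedding`.
    with caplog.at_level(logging.WARNING):
        DeepLake(
            dataset_path="./example_deeplake",
            embedding_function=FakeEmbeddings(size=1536),
        )
    assert "deprecated" in caplog.text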