Dev2049/hf emb encode kwargs (#3925)

Thanks @amogkam for the addition! Refactored slightly

---------

Co-authored-by: Amog Kamsetty <amogkam@users.noreply.github.com>
fix_agent_callbacks
Davis Chase 1 year ago committed by GitHub
parent ffc87233a1
commit 5db6b796cf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -36,6 +36,8 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable.""" Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict) model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model.""" """Key word arguments to pass to the model."""
encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass when calling the `encode` method of the model."""
def __init__(self, **kwargs: Any): def __init__(self, **kwargs: Any):
"""Initialize the sentence_transformer.""" """Initialize the sentence_transformer."""
@ -68,7 +70,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
List of embeddings, one for each text. List of embeddings, one for each text.
""" """
texts = list(map(lambda x: x.replace("\n", " "), texts)) texts = list(map(lambda x: x.replace("\n", " "), texts))
embeddings = self.client.encode(texts) embeddings = self.client.encode(texts, **self.encode_kwargs)
return embeddings.tolist() return embeddings.tolist()
def embed_query(self, text: str) -> List[float]: def embed_query(self, text: str) -> List[float]:
@ -81,7 +83,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
Embeddings for the text. Embeddings for the text.
""" """
text = text.replace("\n", " ") text = text.replace("\n", " ")
embedding = self.client.encode(text) embedding = self.client.encode(text, **self.encode_kwargs)
return embedding.tolist() return embedding.tolist()
@ -89,7 +91,7 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
"""Wrapper around sentence_transformers embedding models. """Wrapper around sentence_transformers embedding models.
To use, you should have the ``sentence_transformers`` To use, you should have the ``sentence_transformers``
and ``InstructorEmbedding`` python package installed. and ``InstructorEmbedding`` python packages installed.
Example: Example:
.. code-block:: python .. code-block:: python
@ -108,7 +110,7 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):
"""Model name to use.""" """Model name to use."""
cache_folder: Optional[str] = None cache_folder: Optional[str] = None
"""Path to store models. """Path to store models.
Can be also set by SENTENCE_TRANSFORMERS_HOME enviroment variable.""" Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict) model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Key word arguments to pass to the model.""" """Key word arguments to pass to the model."""
embed_instruction: str = DEFAULT_EMBED_INSTRUCTION embed_instruction: str = DEFAULT_EMBED_INSTRUCTION

@ -1,5 +1,4 @@
"""Test huggingface embeddings.""" """Test huggingface embeddings."""
import unittest
from langchain.embeddings.huggingface import ( from langchain.embeddings.huggingface import (
HuggingFaceEmbeddings, HuggingFaceEmbeddings,
@ -7,7 +6,6 @@ from langchain.embeddings.huggingface import (
) )
@unittest.skip("This test causes a segfault.")
def test_huggingface_embedding_documents() -> None: def test_huggingface_embedding_documents() -> None:
"""Test huggingface embeddings.""" """Test huggingface embeddings."""
documents = ["foo bar"] documents = ["foo bar"]
@ -17,11 +15,10 @@ def test_huggingface_embedding_documents() -> None:
assert len(output[0]) == 768 assert len(output[0]) == 768
@unittest.skip("This test causes a segfault.")
def test_huggingface_embedding_query() -> None: def test_huggingface_embedding_query() -> None:
"""Test huggingface embeddings.""" """Test huggingface embeddings."""
document = "foo bar" document = "foo bar"
embedding = HuggingFaceEmbeddings() embedding = HuggingFaceEmbeddings(encode_kwargs={"batch_size": 16})
output = embedding.embed_query(document) output = embedding.embed_query(document)
assert len(output) == 768 assert len(output) == 768

Loading…
Cancel
Save