diff --git a/docs/extras/integrations/text_embedding/bge_huggingface.ipynb b/docs/extras/integrations/text_embedding/bge_huggingface.ipynb new file mode 100644 index 0000000000..bcf196fc20 --- /dev/null +++ b/docs/extras/integrations/text_embedding/bge_huggingface.ipynb @@ -0,0 +1,84 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "719619d3", + "metadata": {}, + "source": [ + "# BGE Hugging Face Embeddings\n", + "\n", + "This notebook shows how to use BGE Embeddings through Hugging Face" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f7a54279", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# !pip install sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "9e1d5b6b", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceBgeEmbeddings\n", + "\n", + "model_name = \"BAAI/bge-small-en\"\n", + "model_kwargs = {'device': 'cpu'}\n", + "encode_kwargs = {'normalize_embeddings': False}\n", + "hf = HuggingFaceBgeEmbeddings(\n", + " model_name=model_name,\n", + " model_kwargs=model_kwargs,\n", + " encode_kwargs=encode_kwargs\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e59d1a89", + "metadata": {}, + "outputs": [], + "source": [ + "embedding = hf.embed_query(\"hi this is harrison\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e596315f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/embeddings/__init__.py 
# BGE additions to langchain/embeddings/huggingface.py. The class below is also
# re-exported from ``langchain.embeddings`` (import plus ``__all__`` entry).
DEFAULT_BGE_MODEL = "BAAI/bge-large-en"
DEFAULT_EMBED_BGE_INSTRUCTION = (
    "Represent this sentence for searching relevant passages: "
)
DEFAULT_QUERY_BGE_INSTRUCTION = (
    "Represent this question for searching relevant passages: "
)


class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
    """HuggingFace BGE sentence_transformers embedding models.

    To use, you should have the ``sentence_transformers`` python package installed.

    The configured instruction is prepended to each text as a plain string
    prefix before it is handed to ``SentenceTransformer.encode``.

    Example:
        .. code-block:: python

            from langchain.embeddings import HuggingFaceBgeEmbeddings

            model_name = "BAAI/bge-large-en"
            model_kwargs = {'device': 'cpu'}
            # Normalized vectors let a dot product act as cosine similarity.
            encode_kwargs = {'normalize_embeddings': True}
            hf = HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs=model_kwargs,
                encode_kwargs=encode_kwargs
            )
    """

    client: Any  #: :meta private:
    model_name: str = DEFAULT_BGE_MODEL
    """Model name to use."""
    cache_folder: Optional[str] = None
    """Path to store models.
    Can be also set by SENTENCE_TRANSFORMERS_HOME environment variable."""
    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Key word arguments to pass to the model."""
    encode_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Key word arguments to pass when calling the `encode` method of the model."""
    embed_instruction: str = DEFAULT_EMBED_BGE_INSTRUCTION
    """Instruction to use for embedding documents."""
    # NOTE(review): the BGE model card appears to recommend no instruction for
    # passages; callers can pass ``embed_instruction=""`` -- confirm before
    # changing the default.
    query_instruction: str = DEFAULT_QUERY_BGE_INSTRUCTION
    """Instruction to use for embedding query."""

    def __init__(self, **kwargs: Any):
        """Validate the fields, then load the sentence_transformers model.

        Args:
            **kwargs: Field values for this model (``model_name``,
                ``cache_folder``, ``model_kwargs``, ``encode_kwargs``,
                ``embed_instruction``, ``query_instruction``).

        Raises:
            ImportError: If ``sentence_transformers`` is not installed.
        """
        super().__init__(**kwargs)
        try:
            import sentence_transformers

        except ImportError as exc:
            raise ImportError(
                "Could not import sentence_transformers python package. "
                "Please install it with `pip install sentence_transformers`."
            ) from exc

        self.client = sentence_transformers.SentenceTransformer(
            self.model_name, cache_folder=self.cache_folder, **self.model_kwargs
        )

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a HuggingFace transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text.
        """
        # BGE instructions are plain string prefixes. A ``[instruction, text]``
        # list is the INSTRUCTOR input format; sentence_transformers would
        # treat it as a two-segment text pair rather than a prefixed sentence.
        prefixed_texts = [self.embed_instruction + text for text in texts]
        embeddings = self.client.encode(prefixed_texts, **self.encode_kwargs)
        return embeddings.tolist()

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a HuggingFace transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        # Encode a one-element batch of the prefixed query and unwrap it, so
        # the result is a single vector.
        prefixed_query = self.query_instruction + text
        embedding = self.client.encode([prefixed_query], **self.encode_kwargs)[0]
        return embedding.tolist()