Mirror of https://github.com/hwchase17/langchain, synced 2024-11-08 07:10:35 +00:00
community[patch]: update the default hf bge embeddings (#22627)
**Description:** This updates the langchain_community > huggingface > default bge embeddings ([the current default recommends this change](https://huggingface.co/BAAI/bge-large-en)).
**Issue:** None
**Dependencies:** None
**Twitter handle:** @jonzeolla

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
This commit is contained in:
parent 150251fd49
commit 78ff51ce83
@@ -8,7 +8,7 @@ The Embeddings class is a class designed for interfacing with text embedding models

Embeddings create a vector representation of a piece of text. This is useful because it means we can think about text in the vector space, and do things like semantic search where we look for pieces of text that are most similar in the vector space.

The base Embeddings class in LangChain provides two methods: one for embedding documents and one for embedding a query. The former, `.embed_documents`, takes as input multiple texts, while the latter, `.embed_query`, takes a single text. The reason for having these as two separate methods is that some embedding providers have different embedding methods for documents (to be searched over) vs queries (the search query itself).

`.embed_query` will return a list of floats, whereas `.embed_documents` returns a list of lists of floats.

## Get started
@@ -94,15 +94,6 @@ from langchain_huggingface import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")
```

-You can also leave the `model_name` blank to use the default [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) model.
-
-```python
-from langchain_huggingface import HuggingFaceEmbeddings
-
-embeddings_model = HuggingFaceEmbeddings()
-```

</TabItem>
</Tabs>
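Note (illustrative, not part of this commit): a minimal sketch of the two methods described in the hunk above, using the model name the updated docs now pass explicitly; the sample strings are placeholders.

```python
from langchain_huggingface import HuggingFaceEmbeddings

embeddings_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

# .embed_documents takes multiple texts and returns one vector per text
# (a list of lists of floats).
doc_vectors = embeddings_model.embed_documents(["first document", "second document"])

# .embed_query takes a single text and returns a single vector (a list of floats).
query_vector = embeddings_model.embed_query("What does the first document say?")

print(len(doc_vectors), len(query_vector))
```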
@@ -54,7 +54,7 @@ from langchain_community.embeddings import HuggingFaceInstructEmbeddings

### HuggingFaceBgeEmbeddings

->[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).
+>[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en-v1.5) are one of [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).
>BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development.

See a [usage example](/docs/integrations/text_embedding/bge_huggingface).
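Note (illustrative, not part of this commit): a minimal sketch of `HuggingFaceBgeEmbeddings` with the model name this PR switches to; the keyword arguments mirror the class docstring updated later in this diff, and the query string is a placeholder.

```python
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

model_name = "BAAI/bge-large-en-v1.5"  # updated default recommended by this PR
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}

hf = HuggingFaceBgeEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)
vector = hf.embed_query("What are BGE embeddings?")  # placeholder query
```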
@@ -86,10 +86,10 @@ from langchain_community.embeddings import HuggingFaceHubEmbeddings

### Hugging Face dataset

>[Hugging Face Hub](https://huggingface.co/docs/hub/index) is home to over 75,000
> [datasets](https://huggingface.co/docs/hub/index#datasets) in more than 100 languages
> that can be used for a broad range of tasks across NLP, Computer Vision, and Audio.
> They are used for a diverse range of tasks such as translation, automatic speech
> recognition, and image classification.

We need to install the `datasets` python package.
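Note (illustrative, not part of this commit): the `datasets` package feeds the dataset loader referenced in the next hunk; a minimal sketch, assuming the public `imdb` dataset and its `text` column purely as an example.

```python
from langchain_community.document_loaders.hugging_face_dataset import (
    HuggingFaceDatasetLoader,
)

# "imdb" and "text" are illustrative choices, not part of the diff.
loader = HuggingFaceDatasetLoader(path="imdb", page_content_column="text")
docs = loader.load()
print(docs[0].page_content[:100])
```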
@@ -110,7 +110,7 @@ from langchain_community.document_loaders.hugging_face_dataset import HuggingFaceDatasetLoader

### Hugging Face Hub Tools

>[Hugging Face Tools](https://huggingface.co/docs/transformers/v4.29.0/en/custom_tools)
> support text I/O and are loaded using the `load_huggingface_tool` function.

We need to install several python packages.
@@ -44,11 +44,12 @@ from langchain_community.vectorstores.vdms import VDMS_Client
from langchain_huggingface import HuggingFaceEmbeddings

client = VDMS_Client("localhost", 55555)
+model_name = "sentence-transformers/all-mpnet-base-v2"
vectorstore = VDMS.from_documents(
    docs,
    client=client,
    collection_name="langchain-demo",
-    embedding_function=HuggingFaceEmbeddings(),
+    embedding_function=HuggingFaceEmbeddings(model_name=model_name),
    engine="FaissFlat",
    distance_strategy="L2",
)

@@ -58,5 +59,3 @@ results = vectorstore.similarity_search(query)
```

For a more detailed walkthrough of the VDMS wrapper, see [this notebook](/docs/integrations/vectorstores/vdms)
@@ -7,7 +7,7 @@
"source": [
"# BGE on Hugging Face\n",
"\n",
-">[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).\n",
+">[BGE models on the HuggingFace](https://huggingface.co/BAAI/bge-large-en-v1.5) are one of [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).\n",
">BGE model is created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://en.wikipedia.org/wiki/Beijing_Academy_of_Artificial_Intelligence). `BAAI` is a private non-profit organization engaged in AI research and development.\n",
"\n",
"This notebook shows how to use `BGE Embeddings` through `Hugging Face`"

@@ -36,7 +36,7 @@
"metadata": {},
"outputs": [],
"source": [
-"embeddings = HuggingFaceEmbeddings()"
+"embeddings = HuggingFaceEmbeddings(model_name=\"sentence-transformers/all-mpnet-base-v2\")"
]
},
{
@@ -57,7 +57,8 @@
"from langchain_community.vectorstores import Annoy\n",
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"\n",
-"embeddings_func = HuggingFaceEmbeddings()"
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embeddings_func = HuggingFaceEmbeddings(model_name=model_name)"
]
},
{
@@ -61,7 +61,8 @@
"docs = text_splitter.split_documents(documents)\n",
"\n",
"\n",
-"embeddings = HuggingFaceEmbeddings()\n",
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
"\n",
"db = ScaNN.from_documents(docs, embeddings)\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
@@ -45,7 +45,8 @@
"source": [
"from langchain_huggingface import HuggingFaceEmbeddings\n",
"\n",
-"embeddings = HuggingFaceEmbeddings()"
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embeddings = HuggingFaceEmbeddings(model_name=model_name)"
]
},
{
@@ -92,7 +92,8 @@
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"\n",
-"embeddings = HuggingFaceEmbeddings()"
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embeddings = HuggingFaceEmbeddings(model_name=model_name)"
]
},
{
@@ -51,7 +51,8 @@
"raw_documents = TextLoader(\"../../how_to/state_of_the_union.txt\").load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"documents = text_splitter.split_documents(raw_documents)\n",
-"embeddings = HuggingFaceEmbeddings()\n",
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
"db = TileDB.from_documents(\n",
"    documents, embeddings, index_uri=\"/tmp/tiledb_index\", index_type=\"FLAT\"\n",
")"
@@ -50,7 +50,8 @@
"raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"documents = text_splitter.split_documents(raw_documents)\n",
-"embeddings = HuggingFaceEmbeddings()\n",
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
"db = Vald.from_documents(documents, embeddings, host=\"localhost\", port=8080)"
]
},

@@ -197,7 +198,8 @@
"raw_documents = TextLoader(\"state_of_the_union.txt\").load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"documents = text_splitter.split_documents(raw_documents)\n",
-"embeddings = HuggingFaceEmbeddings()\n",
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embeddings = HuggingFaceEmbeddings(model_name=model_name)\n",
"\n",
"db = Vald.from_documents(\n",
"    documents,\n",
@@ -200,7 +200,8 @@
"\n",
"\n",
"# create the open-source embedding function\n",
-"embedding = HuggingFaceEmbeddings()\n",
+"model_name = \"sentence-transformers/all-mpnet-base-v2\"\n",
+"embedding = HuggingFaceEmbeddings(model_name=model_name)\n",
"print(\n",
"    f\"# Embedding Dimensions: {len(embedding.embed_query('This is a test document.'))}\"\n",
")"
@@ -67,6 +67,19 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer."""
        super().__init__(**kwargs)
+
+        if "model_name" not in kwargs:
+            since = "0.2.16"
+            removal = "0.4.0"
+            warn_deprecated(
+                since=since,
+                removal=removal,
+                message=f"Default values for {self.__class__.__name__}.model_name"
+                + f" were deprecated in LangChain {since} and will be removed in"
+                + f" {removal}. Explicitly pass a model_name to the"
+                + f" {self.__class__.__name__} constructor instead.",
+            )

        try:
            import sentence_transformers

@@ -159,6 +172,19 @@ class HuggingFaceInstructEmbeddings(BaseModel, Embeddings):

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer."""
        super().__init__(**kwargs)
+
+        if "model_name" not in kwargs:
+            since = "0.2.16"
+            removal = "0.4.0"
+            warn_deprecated(
+                since=since,
+                removal=removal,
+                message=f"Default values for {self.__class__.__name__}.model_name"
+                + f" were deprecated in LangChain {since} and will be removed in"
+                + f" {removal}. Explicitly pass a model_name to the"
+                + f" {self.__class__.__name__} constructor instead.",
+            )

        try:
            from InstructorEmbedding import INSTRUCTOR

@@ -231,7 +257,7 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):

            from langchain_community.embeddings import HuggingFaceBgeEmbeddings

-            model_name = "BAAI/bge-large-en"
+            model_name = "BAAI/bge-large-en-v1.5"
            model_kwargs = {'device': 'cpu'}
            encode_kwargs = {'normalize_embeddings': True}
            hf = HuggingFaceBgeEmbeddings(

@@ -279,6 +305,19 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):

    def __init__(self, **kwargs: Any):
        """Initialize the sentence_transformer."""
        super().__init__(**kwargs)
+
+        if "model_name" not in kwargs:
+            since = "0.2.5"
+            removal = "0.4.0"
+            warn_deprecated(
+                since=since,
+                removal=removal,
+                message=f"Default values for {self.__class__.__name__}.model_name"
+                + f" were deprecated in LangChain {since} and will be removed in"
+                + f" {removal}. Explicitly pass a model_name to the"
+                + f" {self.__class__.__name__} constructor instead.",
+            )

        try:
            import sentence_transformers
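Note (illustrative, not part of this commit): the blocks added above mean that constructing these community embeddings classes without an explicit `model_name` still works, but now emits a deprecation warning ahead of the 0.4.0 removal. A minimal sketch of observing that, assuming the warning surfaces through Python's standard warnings machinery (as `warn_deprecated` does); constructing the class downloads the default model on first use.

```python
import warnings

from langchain_community.embeddings import HuggingFaceEmbeddings

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # No model_name: still valid, falls back to the default model, but now warns.
    HuggingFaceEmbeddings()

print([str(w.message) for w in caught])
```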
@@ -303,7 +303,7 @@ class OpenVINOBgeEmbeddings(OpenVINOEmbeddings):

            from langchain_community.embeddings import OpenVINOBgeEmbeddings

-            model_name = "BAAI/bge-large-en"
+            model_name = "BAAI/bge-large-en-v1.5"
            model_kwargs = {'device': 'CPU'}
            encode_kwargs = {'normalize_embeddings': True}
            ov = OpenVINOBgeEmbeddings(
@@ -41,9 +41,10 @@ class ScaNN(VectorStore):

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import ScaNN

+            model_name = "sentence-transformers/all-mpnet-base-v2"
            db = ScaNN.from_texts(
                ['foo', 'bar', 'barz', 'qux'],
-                HuggingFaceEmbeddings())
+                HuggingFaceEmbeddings(model_name=model_name))
            db.similarity_search('foo?', k=1)
    """
@@ -1,12 +1,5 @@
import asyncio
-from typing import (
-    Any,
-    Dict,
-    Iterable,
-    List,
-    Optional,
-    Tuple,
-)
+from typing import Any, Dict, Iterable, List, Optional, Tuple

import numpy as np
from langchain_core.documents import Document

@@ -40,7 +33,8 @@ class SurrealDBStore(VectorStore):

            from langchain_community.vectorstores.surrealdb import SurrealDBStore
            from langchain_community.embeddings import HuggingFaceEmbeddings

-            embedding_function = HuggingFaceEmbeddings()
+            model_name = "sentence-transformers/all-mpnet-base-v2"
+            embedding_function = HuggingFaceEmbeddings(model_name=model_name)
            dburl = "ws://localhost:8000/rpc"
            ns = "langchain"
            db = "docstore"
@@ -23,10 +23,11 @@ class Vald(VectorStore):

            from langchain_community.embeddings import HuggingFaceEmbeddings
            from langchain_community.vectorstores import Vald

+            model_name = "sentence-transformers/all-mpnet-base-v2"
            texts = ['foo', 'bar', 'baz']
            vald = Vald.from_texts(
                texts=texts,
-                embedding=HuggingFaceEmbeddings(),
+                embedding=HuggingFaceEmbeddings(model_name=model_name),
                host="localhost",
                port=8080,
                skip_strict_exist_check=False,
@@ -161,9 +161,10 @@ class VDMS(VectorStore):

            from langchain_huggingface import HuggingFaceEmbeddings
            from langchain_community.vectorstores.vdms import VDMS, VDMS_Client

+            model_name = "sentence-transformers/all-mpnet-base-v2"
            vectorstore = VDMS(
                client=VDMS_Client("localhost", 55555),
-                embedding=HuggingFaceEmbeddings(),
+                embedding=HuggingFaceEmbeddings(model_name=model_name),
                collection_name="langchain-demo",
                distance_strategy="L2",
                engine="FaissFlat",
@@ -92,9 +92,10 @@ from langchain.chains.query_constructor.schema import AttributeInfo

from self_query_qdrant.chain import create_chain

+model_name = "sentence-transformers/all-mpnet-base-v2"
chain = create_chain(
    llm=Cohere(),
-    embeddings=HuggingFaceEmbeddings(),
+    embeddings=HuggingFaceEmbeddings(model_name=model_name),
    document_contents="Descriptions of cats, along with their names and breeds.",
    metadata_field_info=[
        AttributeInfo(name="name", description="Name of the cat", type="string"),

@@ -112,8 +113,9 @@ from langchain_community.embeddings import HuggingFaceEmbeddings

from self_query_qdrant.chain import initialize

+model_name = "sentence-transformers/all-mpnet-base-v2"
initialize(
-    embeddings=HuggingFaceEmbeddings(),
+    embeddings=HuggingFaceEmbeddings(model_name=model_name),
    collection_name="cats",
    documents=[
        Document(

@@ -145,7 +147,7 @@ langchain serve

### Local Server

This will start the FastAPI app with a server running locally at
[http://localhost:8000](http://localhost:8000)

You can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)