openai[patch]: fix special token default behavior (#21131)

By default, handle special sequences as regular text
Bagatur committed 1 month ago via GitHub
parent 0f7f448603
commit bef50ded63
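
The crux of the change is how tiktoken handles special sequences. A minimal sketch of the before/after behavior, assuming only that `tiktoken` is installed (this snippet is illustrative and not part of the diff):

```python
# Sketch of the behavior this patch changes (not library code).
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
text = "disallowed special token '<|endoftext|>'"

# Old defaults (allowed_special=set(), disallowed_special="all"):
# encode() raises because the text contains a disallowed special sequence.
try:
    enc.encode(text, allowed_special=set(), disallowed_special="all")
except ValueError as err:
    print(f"old default raised: {err}")

# New defaults (both fields None): fall through to encode_ordinary(),
# which tokenizes the special sequence as regular text, matching what
# the OpenAI API does with raw embedding inputs.
print(enc.encode_ordinary(text))
```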

@@ -82,8 +82,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
     """Automatically inferred from env var `OPENAI_API_KEY` if not provided."""
     openai_organization: Optional[str] = Field(default=None, alias="organization")
     """Automatically inferred from env var `OPENAI_ORG_ID` if not provided."""
-    allowed_special: Union[Literal["all"], Set[str]] = set()
-    disallowed_special: Union[Literal["all"], Set[str], Sequence[str]] = "all"
+    allowed_special: Union[Literal["all"], Set[str], None] = None
+    disallowed_special: Union[Literal["all"], Set[str], Sequence[str], None] = None
     chunk_size: int = 1000
     """Maximum number of texts to embed in each batch"""
     max_retries: int = 2
@@ -246,31 +246,12 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             params["dimensions"] = self.dimensions
         return params

-    # please refer to
-    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
-    def _get_len_safe_embeddings(
-        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
-    ) -> List[List[float]]:
-        """
-        Generate length-safe embeddings for a list of texts.
-
-        This method handles tokenization and embedding generation, respecting the
-        set embedding context length and chunk size. It supports both tiktoken
-        and HuggingFace tokenizer based on the tiktoken_enabled flag.
-
-        Args:
-            texts (List[str]): A list of texts to embed.
-            engine (str): The engine or model to use for embeddings.
-            chunk_size (Optional[int]): The size of chunks for processing embeddings.
-
-        Returns:
-            List[List[float]]: A list of embeddings for each input text.
-        """
+    def _tokenize(
+        self, texts: List[str], chunk_size: int
+    ) -> Tuple[Iterable[int], List[List[float]], List[int]]:
         tokens = []
         indices = []
         model_name = self.tiktoken_model_name or self.model
-        _chunk_size = chunk_size or self.chunk_size
         # If tiktoken flag set to False
         if not self.tiktoken_enabled:
@@ -303,6 +284,14 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                 encoding = tiktoken.encoding_for_model(model_name)
             except KeyError:
                 encoding = tiktoken.get_encoding("cl100k_base")
+            encoder_kwargs: Dict[str, Any] = {
+                k: v
+                for k, v in {
+                    "allowed_special": self.allowed_special,
+                    "disallowed_special": self.disallowed_special,
+                }.items()
+                if v is not None
+            }
             for i, text in enumerate(texts):
                 if self.model.endswith("001"):
                     # See: https://github.com/openai/openai-python/
@@ -310,11 +299,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
                     # replace newlines, which can negatively affect performance.
                     text = text.replace("\n", " ")
-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
+                if encoder_kwargs:
+                    token = encoding.encode(text, **encoder_kwargs)
+                else:
+                    token = encoding.encode_ordinary(text)
                 # Split tokens into chunks respecting the embedding_ctx_length
                 for j in range(0, len(token), self.embedding_ctx_length):
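
Taken together, the two hunks above mean tiktoken options are only forwarded when the user actually set them; otherwise encoding falls through to `encode_ordinary()`, which treats special sequences as plain text. A standalone sketch of the filtering idiom, with made-up values for illustration:

```python
# Only options the user actually set (non-None) survive the comprehension.
opts = {
    "allowed_special": None,                  # unset: dropped
    "disallowed_special": {"<|endoftext|>"},  # user-set: forwarded
}
encoder_kwargs = {k: v for k, v in opts.items() if v is not None}
assert encoder_kwargs == {"disallowed_special": {"<|endoftext|>"}}
# An empty encoder_kwargs is what routes encoding to encode_ordinary().
```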
@@ -325,12 +313,35 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             try:
                 from tqdm.auto import tqdm

-                _iter: Iterable = tqdm(range(0, len(tokens), _chunk_size))
+                _iter: Iterable = tqdm(range(0, len(tokens), chunk_size))
             except ImportError:
-                _iter = range(0, len(tokens), _chunk_size)
+                _iter = range(0, len(tokens), chunk_size)
         else:
-            _iter = range(0, len(tokens), _chunk_size)
+            _iter = range(0, len(tokens), chunk_size)
+        return _iter, tokens, indices
+
+    # please refer to
+    # https://github.com/openai/openai-cookbook/blob/main/examples/Embedding_long_inputs.ipynb
+    def _get_len_safe_embeddings(
+        self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
+    ) -> List[List[float]]:
+        """
+        Generate length-safe embeddings for a list of texts.
+
+        This method handles tokenization and embedding generation, respecting the
+        set embedding context length and chunk size. It supports both tiktoken
+        and HuggingFace tokenizer based on the tiktoken_enabled flag.
+
+        Args:
+            texts (List[str]): A list of texts to embed.
+            engine (str): The engine or model to use for embeddings.
+            chunk_size (Optional[int]): The size of chunks for processing embeddings.
+
+        Returns:
+            List[List[float]]: A list of embeddings for each input text.
+        """
+        _chunk_size = chunk_size or self.chunk_size
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
         batched_embeddings: List[List[float]] = []
         for i in _iter:
             response = self.client.create(
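
For orientation, here is a minimal, self-contained sketch of the length-safe pattern that `_tokenize` now centralizes for both the sync and async paths: split each text into context-length chunks, remember which source text each chunk came from via `indices`, embed per chunk, then combine the chunk vectors back into one vector per text (the linked cookbook notebook uses a length-weighted average). All names here are illustrative, not the library's:

```python
# Illustrative sketch only; the real implementation lives in the diff above.
from typing import List, Tuple

import numpy as np


def tokenize(texts: List[str], ctx_len: int) -> Tuple[List[List[str]], List[int]]:
    """Split each text into chunks of at most ctx_len tokens, tracking origins."""
    chunks: List[List[str]] = []
    indices: List[int] = []
    for i, text in enumerate(texts):
        toks = text.split()  # stand-in for tiktoken token ids
        for j in range(0, len(toks), ctx_len):
            chunks.append(toks[j : j + ctx_len])
            indices.append(i)
    return chunks, indices


def embed(chunk: List[str]) -> np.ndarray:
    return np.full(4, float(len(chunk)))  # stand-in for one API embedding


texts = ["one two three four five", "six"]
chunks, indices = tokenize(texts, ctx_len=2)
vectors: List[List[np.ndarray]] = [[] for _ in texts]
weights: List[List[int]] = [[] for _ in texts]
for chunk, i in zip(chunks, indices):
    vectors[i].append(embed(chunk))
    weights[i].append(len(chunk))
# combine chunk vectors into one vector per text (length-weighted mean)
results = [np.average(np.stack(v), axis=0, weights=w) for v, w in zip(vectors, weights)]
assert len(results) == len(texts)
```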
@@ -399,62 +410,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
             List[List[float]]: A list of embeddings for each input text.
         """
-        tokens = []
-        indices = []
-        model_name = self.tiktoken_model_name or self.model
         _chunk_size = chunk_size or self.chunk_size
-        # If tiktoken flag set to False
-        if not self.tiktoken_enabled:
-            try:
-                from transformers import AutoTokenizer
-            except ImportError:
-                raise ValueError(
-                    "Could not import transformers python package. "
-                    "This is needed in order to for OpenAIEmbeddings without "
-                    " `tiktoken`. Please install it with `pip install transformers`."
-                )
-            tokenizer = AutoTokenizer.from_pretrained(
-                pretrained_model_name_or_path=model_name
-            )
-            for i, text in enumerate(texts):
-                # Tokenize the text using HuggingFace transformers
-                tokenized = tokenizer.encode(text, add_special_tokens=False)
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(tokenized), self.embedding_ctx_length):
-                    token_chunk = tokenized[j : j + self.embedding_ctx_length]
-                    # Convert token IDs back to a string
-                    chunk_text = tokenizer.decode(token_chunk)
-                    tokens.append(chunk_text)
-                    indices.append(i)
-        else:
-            try:
-                encoding = tiktoken.encoding_for_model(model_name)
-            except KeyError:
-                logger.warning("Warning: model not found. Using cl100k_base encoding.")
-                model = "cl100k_base"
-                encoding = tiktoken.get_encoding(model)
-            for i, text in enumerate(texts):
-                if self.model.endswith("001"):
-                    # See: https://github.com/openai/openai-python/
-                    # issues/418#issuecomment-1525939500
-                    # replace newlines, which can negatively affect performance.
-                    text = text.replace("\n", " ")
-                token = encoding.encode(
-                    text=text,
-                    allowed_special=self.allowed_special,
-                    disallowed_special=self.disallowed_special,
-                )
-                # Split tokens into chunks respecting the embedding_ctx_length
-                for j in range(0, len(token), self.embedding_ctx_length):
-                    tokens.append(token[j : j + self.embedding_ctx_length])
-                    indices.append(i)
+        _iter, tokens, indices = self._tokenize(texts, _chunk_size)
         batched_embeddings: List[List[float]] = []
-        _chunk_size = chunk_size or self.chunk_size
         for i in range(0, len(tokens), _chunk_size):

@@ -1286,4 +1286,4 @@ watchmedo = ["PyYAML (>=3.10)"]
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.8.1,<4.0"
-content-hash = "f8a406a4ebd93e5c2ef3fcf4a3cebdd588ce09e288dc31b7b9b6b1560285575a"
+content-hash = "1d9cefc90178d94dee2a09afc14af160a7e35e4972ad4701d3bbbfdde14a81fa"

@@ -29,6 +29,7 @@ pytest-asyncio = "^0.21.1"
 langchain-core = { path = "../../core", develop = true }
 pytest-cov = "^4.1.0"
 langchain-standard-tests = { path = "../../standard-tests", develop = true }
+numpy = "^1.24"

 [tool.poetry.group.codespell]
 optional = true

@@ -1,4 +1,7 @@
 """Test OpenAI embeddings."""
+import numpy as np
+import openai
+
 from langchain_openai.embeddings.base import OpenAIEmbeddings
@@ -26,3 +29,31 @@ def test_langchain_openai_embeddings_dimensions() -> None:
     output = embedding.embed_documents(documents)
     assert len(output) == 1
     assert len(output[0]) == 128
+
+
+def test_langchain_openai_embeddings_equivalent_to_raw() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+    lc_output = embedding.embed_documents(documents)[0]
+    direct_output = (
+        openai.OpenAI()
+        .embeddings.create(input=documents, model=embedding.model)
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
+
+
+async def test_langchain_openai_embeddings_equivalent_to_raw_async() -> None:
+    documents = ["disallowed special token '<|endoftext|>'"]
+    embedding = OpenAIEmbeddings()
+    lc_output = (await embedding.aembed_documents(documents))[0]
+    client = openai.AsyncOpenAI()
+    direct_output = (
+        (await client.embeddings.create(input=documents, model=embedding.model))
+        .data[0]
+        .embedding
+    )
+    assert np.isclose(lc_output, direct_output).all()
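
The tests pin down the new default: LangChain's embeddings now match what the raw OpenAI client returns even when the input contains a special sequence. Callers who relied on the old strict behavior can still opt back in explicitly; a sketch using the two fields from the diff above (not part of this change):

```python
# Sketch: choosing special-token handling explicitly (illustrative only).
from langchain_openai.embeddings.base import OpenAIEmbeddings

default = OpenAIEmbeddings()  # special sequences handled as regular text
strict = OpenAIEmbeddings(disallowed_special="all")  # old default: raise on <|endoftext|>
permissive = OpenAIEmbeddings(allowed_special="all")  # encode specials as special tokens
```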
