@ -178,7 +178,7 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
openai . api_type = openai_api_type
values [ " client " ] = openai . Embedding
except ImportError :
raise Value Error(
raise Import Error(
" Could not import openai python package. "
" Please install it with `pip install openai`. "
)
# NOTE(review): signature reconstructed from the body's use of self/texts/chunk_size
# (the `def` line fell outside the pasted diff hunk) — confirm against the full file.
def _get_len_safe_embeddings(
    self, texts: List[str], *, engine: str, chunk_size: Optional[int] = None
) -> List[List[float]]:
    """Embed texts of arbitrary length by chunking them to the model context size.

    Each input text is tokenized and split into chunks of at most
    ``self.embedding_ctx_length`` tokens; every chunk is embedded via the
    OpenAI API, and the per-chunk embeddings for a text are combined into a
    single unit-normalized vector by a token-count-weighted average.

    Args:
        texts: The list of texts to embed.
        engine: Deployment/engine name (the API call uses ``self.deployment``).
        chunk_size: Number of token-chunks per API request; defaults to
            ``self.chunk_size`` when falsy.

    Returns:
        One embedding (list of floats) per input text, in input order.

    Raises:
        ImportError: If the ``tiktoken`` package is not installed.
    """
    embeddings: List[List[float]] = [[] for _ in range(len(texts))]
    # Keep the try narrow: only the import can raise ImportError here, so a
    # genuine runtime failure later in the body is not mislabeled as a
    # missing-package error (this was the intent of the original patch).
    try:
        import tiktoken
    except ImportError:
        raise ImportError(
            "Could not import tiktoken python package. "
            "This is needed in order to for OpenAIEmbeddings. "
            "Please install it with `pip install tiktoken`."
        )

    # Tokenize every text and split into context-sized chunks; `indices[k]`
    # remembers which input text produced chunk `tokens[k]`.
    tokens = []
    indices = []
    encoding = tiktoken.model.encoding_for_model(self.model)
    for i, text in enumerate(texts):
        if self.model.endswith("001"):
            # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500
            # replace newlines, which can negatively affect performance.
            text = text.replace("\n", " ")
        token = encoding.encode(
            text,
            allowed_special=self.allowed_special,
            disallowed_special=self.disallowed_special,
        )
        for j in range(0, len(token), self.embedding_ctx_length):
            tokens += [token[j : j + self.embedding_ctx_length]]
            indices += [i]

    # Embed the chunks in batches of `_chunk_size` per API request.
    batched_embeddings = []
    _chunk_size = chunk_size or self.chunk_size
    for i in range(0, len(tokens), _chunk_size):
        response = embed_with_retry(
            self,
            input=tokens[i : i + _chunk_size],
            engine=self.deployment,
            request_timeout=self.request_timeout,
            headers=self.headers,
        )
        batched_embeddings += [r["embedding"] for r in response["data"]]

    # Regroup chunk embeddings (and their token counts, used as averaging
    # weights) back under the originating text index.
    results: List[List[List[float]]] = [[] for _ in range(len(texts))]
    num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
    for i in range(len(indices)):
        results[indices[i]].append(batched_embeddings[i])
        num_tokens_in_batch[indices[i]].append(len(tokens[i]))

    for i in range(len(texts)):
        _result = results[i]
        if len(_result) == 0:
            # Text produced no chunks (e.g. empty string): fall back to the
            # embedding of the empty string.
            average = embed_with_retry(
                self,
                input="",
                engine=self.deployment,
                request_timeout=self.request_timeout,
                headers=self.headers,
            )["data"][0]["embedding"]
        else:
            # Weight each chunk by its token count so longer chunks
            # contribute proportionally to the combined vector.
            average = np.average(_result, axis=0, weights=num_tokens_in_batch[i])
        # L2-normalize so downstream cosine similarity is well-behaved.
        embeddings[i] = (average / np.linalg.norm(average)).tolist()

    return embeddings
def _embedding_func ( self , text : str , * , engine : str ) - > List [ float ] :
""" Call out to OpenAI ' s embedding endpoint. """