From a5a4999fb7c33e5c50b5db94ffa57c9985b27608 Mon Sep 17 00:00:00 2001 From: Rafal Wojdyla Date: Tue, 2 May 2023 04:09:20 +0100 Subject: [PATCH] New lines should be removed only for the 1st gen embedding models (#3853) Only 1st generation OpenAI embedding models are negatively impacted by new lines. Context: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 --- langchain/embeddings/openai.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/langchain/embeddings/openai.py b/langchain/embeddings/openai.py index c10ffc60..c9a5065d 100644 --- a/langchain/embeddings/openai.py +++ b/langchain/embeddings/openai.py @@ -158,8 +158,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings): indices = [] encoding = tiktoken.model.encoding_for_model(self.model) for i, text in enumerate(texts): - # replace newlines, which can negatively affect performance. - text = text.replace("\n", " ") + if self.model.endswith("001"): + # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + # replace newlines, which can negatively affect performance. + text = text.replace("\n", " ") token = encoding.encode( text, allowed_special=self.allowed_special, @@ -212,8 +214,10 @@ class OpenAIEmbeddings(BaseModel, Embeddings): if len(text) > self.embedding_ctx_length: return self._get_len_safe_embeddings([text], engine=engine)[0] else: - # replace newlines, which can negatively affect performance. - text = text.replace("\n", " ") + if self.model.endswith("001"): + # See: https://github.com/openai/openai-python/issues/418#issuecomment-1525939500 + # replace newlines, which can negatively affect performance. + text = text.replace("\n", " ") return embed_with_retry(self, input=[text], engine=engine)["data"][0][ "embedding" ]