From ea1ab391d421ff054def570109f3ca511536ef3d Mon Sep 17 00:00:00 2001 From: Lance Martin <122662504+rlancemartin@users.noreply.github.com> Date: Fri, 3 Nov 2023 13:33:36 -0700 Subject: [PATCH] Open Clip multimodal embeddings (#12754) --- .../text_embedding/open_clip.ipynb | 145 ++++++++++++++++++ .../langchain/embeddings/__init__.py | 2 + .../langchain/embeddings/open_clip.py | 56 +++++++ .../unit_tests/embeddings/test_imports.py | 1 + 4 files changed, 204 insertions(+) create mode 100644 docs/docs/integrations/text_embedding/open_clip.ipynb create mode 100644 libs/langchain/langchain/embeddings/open_clip.py diff --git a/docs/docs/integrations/text_embedding/open_clip.ipynb b/docs/docs/integrations/text_embedding/open_clip.ipynb new file mode 100644 index 0000000000..e00315ddbc --- /dev/null +++ b/docs/docs/integrations/text_embedding/open_clip.ipynb @@ -0,0 +1,145 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "bb9b2af6-325f-4d1e-8e74-96ca5c2e27c5", + "metadata": {}, + "source": [ + "# OpenClip\n", + "\n", + "[OpenClip](https://github.com/mlfoundations/open_clip/tree/main) is an open-source implementation of OpenAI's CLIP.\n", + "\n", + "These multi-modal embeddings can be used to embed images or text.\n", + "\n", + "For text, use the same method `embed_documents` as with other embedding models.\n", + "\n", + "For images, use `embed_image` and pass a list of numpy arrays." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "570d818f-5705-4532-8e77-b0f335bb515d", + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install pillow open_clip_torch torch" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "c4fc9af7-5659-4008-b6a3-0f99c68324aa", + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import seaborn as sns\n", + "from PIL import Image as _PILImage\n", + "from langchain.embeddings import OpenCLIPEmbeddings\n", + "\n", + "# Images\n", + "img_path_dog='/Users/rlm/Desktop/Papers/LLaVA/dog.jpeg'\n", + "img_path_house='/Users/rlm/Desktop/Papers/LLaVA/house.jpeg'\n", + "\n", + "# Load images and convert to numpy arrays\n", + "image_np_dog = np.array(_PILImage.open(img_path_dog).convert(\"RGB\"))\n", + "image_np_house = np.array(_PILImage.open(img_path_house).convert(\"RGB\"))\n", + "\n", + "# Embed images or text\n", + "clip_embd = OpenCLIPEmbeddings()\n", + "img_feat_dog = clip_embd.embed_image([image_np_dog])\n", + "img_feat_house = clip_embd.embed_image([image_np_house])\n", + "text_feat_dog = clip_embd.embed_documents([\"dog\"])\n", + "text_feat_house = clip_embd.embed_documents([\"house\"])" + ] + }, + { + "cell_type": "markdown", + "id": "52b5e8d0-d1c7-475f-9d33-b8a1c492398a", + "metadata": {}, + "source": [ + "## Sanity check\n", + "\n", + "We can check similarity. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6d7962f3-8f05-463d-97b2-955eaa89d18d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2, 2)\n" + ] + }, + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Convert to numpy arrays\n", + "img_feat_dog_np = np.array(img_feat_dog[0])\n", + "img_feat_house_np = np.array(img_feat_house[0])\n", + "text_feat_dog_np = np.array(text_feat_dog[0])\n", + "text_feat_house_np = np.array(text_feat_house[0])\n", + "\n", + "# Compute similarity\n", + "similarities = np.array([\n", + " [text_feat_dog_np @ img_feat_dog_np.T][0][0], \n", + " [text_feat_dog_np @ img_feat_house_np.T][0][0],\n", + " [text_feat_house_np @ img_feat_dog_np.T][0][0], \n", + " [text_feat_house_np @ img_feat_house_np.T][0][0]\n", + "]).reshape(2, 2)\n", + "\n", + "# Ensure similarities is of shape (2, 2)\n", + "print(similarities.shape) # Expected: (2, 2)\n", + "\n", + "# Plot heatmap\n", + "sns.heatmap(similarities, annot=True, cmap='viridis', xticklabels=['dog image', 'house image'], yticklabels=['dog text', 'house text'])\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.16" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/embeddings/__init__.py b/libs/langchain/langchain/embeddings/__init__.py index fb9c7649d3..9098acd527 100644 --- a/libs/langchain/langchain/embeddings/__init__.py +++ b/libs/langchain/langchain/embeddings/__init__.py @@ -53,6 +53,7 @@ from langchain.embeddings.mosaicml import MosaicMLInstructorEmbeddings from langchain.embeddings.nlpcloud import NLPCloudEmbeddings from langchain.embeddings.octoai_embeddings import OctoAIEmbeddings from langchain.embeddings.ollama import OllamaEmbeddings +from langchain.embeddings.open_clip import OpenCLIPEmbeddings from langchain.embeddings.openai 
import OpenAIEmbeddings from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings from langchain.embeddings.self_hosted import SelfHostedEmbeddings @@ -117,6 +118,7 @@ __all__ = [ "QianfanEmbeddingsEndpoint", "JohnSnowLabsEmbeddings", "VoyageEmbeddings", + "OpenCLIPEmbeddings", ] diff --git a/libs/langchain/langchain/embeddings/open_clip.py b/libs/langchain/langchain/embeddings/open_clip.py new file mode 100644 index 0000000000..c19cabb253 --- /dev/null +++ b/libs/langchain/langchain/embeddings/open_clip.py @@ -0,0 +1,111 @@
+from typing import Any, Dict, List
+
+import numpy as np
+
+from langchain.pydantic_v1 import BaseModel, root_validator
+from langchain.schema.embeddings import Embeddings
+
+
+class OpenCLIPEmbeddings(BaseModel, Embeddings):
+    """Embed text and images with an OpenCLIP model.
+
+    ``validate_environment`` loads the ``ViT-B-32`` model with the
+    ``laion2b_s34b_b79k`` checkpoint, its image preprocessing transform and
+    its tokenizer whenever an instance is validated.
+    """
+
+    # Populated by ``validate_environment``.
+    model: Any
+    preprocess: Any
+    tokenizer: Any
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that open_clip and torch libraries are installed.
+
+        Args:
+            values: Field values collected by pydantic.
+
+        Returns:
+            ``values`` with ``model``, ``preprocess`` and ``tokenizer`` set.
+
+        Raises:
+            ImportError: If ``open_clip`` cannot be imported.
+        """
+        # Guard only the import so that an ImportError raised while the model
+        # is being built is not misreported as a missing dependency.
+        try:
+            import open_clip
+        except ImportError as exc:
+            raise ImportError(
+                "Please ensure both open_clip and torch libraries are installed. "
+                "pip install open_clip_torch torch"
+            ) from exc
+
+        model_name = "ViT-B-32"
+        checkpoint = "laion2b_s34b_b79k"
+        model, _, preprocess = open_clip.create_model_and_transforms(
+            model_name=model_name, pretrained=checkpoint
+        )
+        # The open_clip README notes that models start in train mode; this
+        # class only runs inference, so switch to eval mode.
+        model.eval()
+        values["model"] = model
+        values["preprocess"] = preprocess
+        values["tokenizer"] = open_clip.get_tokenizer(model_name)
+        return values
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of texts.
+
+        Args:
+            texts: The texts to embed.
+
+        Returns:
+            One entry per text, in input order: the output of the model's
+            ``encode_text`` converted with ``tolist()``.
+        """
+        import torch
+
+        # NOTE(review): each text is encoded on its own, so every entry
+        # presumably keeps a leading batch axis of size one (the example
+        # notebook indexes it away); kept as is so callers see no change.
+        with torch.no_grad():  # inference only, no autograd graph needed
+            return [
+                self.model.encode_text(self.tokenizer(text)).tolist()
+                for text in texts
+            ]
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a single text; returns the first entry of ``embed_documents``."""
+        return self.embed_documents([text])[0]
+
+    def embed_image(self, images: List[np.ndarray]) -> List[List[float]]:
+        """Embed a list of images.
+
+        Args:
+            images: Images as numpy arrays accepted by ``PIL.Image.fromarray``.
+
+        Returns:
+            One entry per image, in input order: the output of the model's
+            ``encode_image`` converted with ``tolist()``.
+
+        Raises:
+            ImportError: If Pillow is not installed.
+        """
+        try:
+            from PIL import Image as _PILImage
+        except ImportError as exc:
+            raise ImportError(
+                "Please install the PIL library: pip install pillow"
+            ) from exc
+        import torch
+
+        pil_images = [_PILImage.fromarray(image) for image in images]
+        with torch.no_grad():  # inference only, no autograd graph needed
+            return [
+                self.model.encode_image(
+                    self.preprocess(pil_image).unsqueeze(0)  # add batch axis
+                ).tolist()
+                for pil_image in pil_images
+            ]
diff --git a/libs/langchain/tests/unit_tests/embeddings/test_imports.py b/libs/langchain/tests/unit_tests/embeddings/test_imports.py index 8d05d3634e..dd4f1aebe8 100644 --- a/libs/langchain/tests/unit_tests/embeddings/test_imports.py +++ b/libs/langchain/tests/unit_tests/embeddings/test_imports.py @@ -48,6 +48,7 @@ EXPECTED_ALL = [ "QianfanEmbeddingsEndpoint", "JohnSnowLabsEmbeddings", "VoyageEmbeddings", + "OpenCLIPEmbeddings", ]