From e2d61ab85a1c6d4cc80cb1d605405653ccbf86cd Mon Sep 17 00:00:00 2001
From: rjarun8 <50106442+rjarun8@users.noreply.github.com>
Date: Mon, 3 Jul 2023 21:08:31 +0530
Subject: [PATCH] Add SpacyEmbeddings class (#6967)

- Description: Added a new SpacyEmbeddings class for generating
embeddings using the Spacy library.
- Issue: Sentencebert/Bert/Spacy/Doc2vec embedding support #6952
- Dependencies: This change requires the Spacy library and the
'en_core_web_sm' Spacy model.
- Tag maintainer: @dev2049
- Twitter handle: N/A

This change includes a new SpacyEmbeddings class and an example notebook,
but does not include a test.

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
---
 .../integrations/spacy_embedding.ipynb        | 126 ++++++++++++++++++
 langchain/embeddings/__init__.py              |   2 +
 langchain/embeddings/spacy_embeddings.py      | 114 ++++++++++++++++
 3 files changed, 242 insertions(+)
 create mode 100644 docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb
 create mode 100644 langchain/embeddings/spacy_embeddings.py

diff --git a/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb b/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb
new file mode 100644
index 0000000000..0e83aaea70
--- /dev/null
+++ b/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Spacy Embedding\n",
+    "\n",
+    "### Loading the Spacy embedding class to generate and query embeddings"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Import the necessary classes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from langchain.embeddings.spacy_embeddings import SpacyEmbeddings\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Initialize SpacyEmbeddings. This will load the Spacy model into memory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "embedder = SpacyEmbeddings()\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Define some example texts. These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "texts = [\n",
+    "    \"The quick brown fox jumps over the lazy dog.\",\n",
+    "    \"Pack my box with five dozen liquor jugs.\",\n",
+    "    \"How vexingly quick daft zebras jump!\",\n",
+    "    \"Bright vixens jump; dozy fowl quack.\"\n",
+    "]\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Generate and print embeddings for the texts. The SpacyEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "embeddings = embedder.embed_documents(texts)\n",
+    "for i, embedding in enumerate(embeddings):\n",
+    "    print(f\"Embedding for document {i+1}: {embedding}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Generate and print an embedding for a single piece of text. You can also generate an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "query = \"Quick foxes and lazy dogs.\"\n",
+    "query_embedding = embedder.embed_query(query)\n",
+    "print(f\"Embedding for query: {query_embedding}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py
index a492ab3b0c..7edd63e838 100644
--- a/langchain/embeddings/__init__.py
+++ b/langchain/embeddings/__init__.py
@@ -33,6 +33,7 @@ from langchain.embeddings.self_hosted_hugging_face import (
     SelfHostedHuggingFaceInstructEmbeddings,
 )
 from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain.embeddings.tensorflow_hub import TensorflowHubEmbeddings
 from langchain.embeddings.vertexai import VertexAIEmbeddings
 
@@ -66,6 +67,7 @@ __all__ = [
     "DashScopeEmbeddings",
     "EmbaasEmbeddings",
     "OctoAIEmbeddings",
+    "SpacyEmbeddings",
 ]
 
 
diff --git a/langchain/embeddings/spacy_embeddings.py b/langchain/embeddings/spacy_embeddings.py
new file mode 100644
index 0000000000..66f4baa2f6
--- /dev/null
+++ b/langchain/embeddings/spacy_embeddings.py
@@ -0,0 +1,114 @@
+import importlib.util
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Extra, root_validator
+
+from langchain.embeddings.base import Embeddings
+
+
+class SpacyEmbeddings(BaseModel, Embeddings):
+    """
+    SpacyEmbeddings is a class for generating embeddings using the Spacy library.
+    It only supports the 'en_core_web_sm' model.
+
+    Attributes:
+        nlp (Any): The Spacy model; always set by `validate_environment`.
+
+    Methods:
+        embed_documents(texts: List[str]) -> List[List[float]]:
+            Generates embeddings for a list of documents.
+        embed_query(text: str) -> List[float]:
+            Generates an embedding for a single piece of text.
+    """
+
+    nlp: Any  # The Spacy model loaded into memory
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid  # Forbid extra attributes during model initialization
+
+    @root_validator(pre=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """
+        Checks that Spacy is installed and loads the 'en_core_web_sm' model.
+
+        Args:
+            values (Dict): The values provided to the class constructor.
+
+        Returns:
+            The values, with "nlp" set to the loaded Spacy model.
+
+        Raises:
+            ValueError: If the Spacy package or the 'en_core_web_sm'
+                model is not installed.
+        """
+        # Check if the Spacy package is installed
+        if importlib.util.find_spec("spacy") is None:
+            raise ValueError(
+                "Spacy package not found. "
+                "Please install it with `pip install spacy`."
+            )
+        try:
+            # Import here, after the availability check above, then load the model
+            import spacy
+
+            values["nlp"] = spacy.load("en_core_web_sm")
+        except OSError as err:
+            # Chain the original error so the underlying cause stays visible
+            raise ValueError(
+                "Spacy model 'en_core_web_sm' not found. "
+                "Please install it with"
+                " `python -m spacy download en_core_web_sm`."
+            ) from err
+        return values
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generates embeddings for a list of documents.
+
+        Args:
+            texts (List[str]): The documents to generate embeddings for.
+
+        Returns:
+            A list of embeddings, one for each document.
+        """
+        return [self.nlp(text).vector.tolist() for text in texts]
+
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Generates an embedding for a single piece of text.
+
+        Args:
+            text (str): The text to generate an embedding for.
+
+        Returns:
+            The embedding for the text.
+        """
+        return self.nlp(text).vector.tolist()
+
+    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Asynchronously generates embeddings for a list of documents.
+        This method is not implemented and raises a NotImplementedError.
+
+        Args:
+            texts (List[str]): The documents to generate embeddings for.
+
+        Raises:
+            NotImplementedError: This method is not implemented.
+        """
+        raise NotImplementedError("Asynchronous embedding generation is not supported.")
+
+    async def aembed_query(self, text: str) -> List[float]:
+        """
+        Asynchronously generates an embedding for a single piece of text.
+        This method is not implemented and raises a NotImplementedError.
+
+        Args:
+            text (str): The text to generate an embedding for.
+
+        Raises:
+            NotImplementedError: This method is not implemented.
+        """
+        raise NotImplementedError("Asynchronous embedding generation is not supported.")