Add SpacyEmbeddings class (#6967)

- Description: Added a new SpacyEmbeddings class for generating embeddings using the Spacy library. - Issue: Sentencebert/Bert/Spacy/Doc2vec embedding support #6952 - Dependencies: This change requires the Spacy library and the 'en_core_web_sm' Spacy model. - Tag maintainer: @dev2049 - Twitter handle: N/A This change includes a new SpacyEmbeddings class, but does not include a test or an example notebook. --------- Co-authored-by: Bagatur <baskaryan@gmail.com>
1 year ago · e2d61ab85a
parent 16fbd528c5
commit e2d61ab85a
3 changed files with 242 additions and 0 deletions
--- a/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb
+++ b/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Spacy Embedding\n",
+    "\n",
+    "### Loading the Spacy embedding class to generate and query embeddings"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Import the necessary classes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from langchain.embeddings.spacy_embeddings import SpacyEmbeddings\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Initialize SpacyEmbeddings. This will load the Spacy model into memory."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "embedder = SpacyEmbeddings()\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Define some example texts. These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "\n",
+    "texts = [\n",
+    "    \"The quick brown fox jumps over the lazy dog.\",\n",
+    "    \"Pack my box with five dozen liquor jugs.\",\n",
+    "    \"How vexingly quick daft zebras jump!\",\n",
+    "    \"Bright vixens jump; dozy fowl quack.\"\n",
+    "]\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Generate and print embeddings for the texts. The SpacyEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "embeddings = embedder.embed_documents(texts)\n",
+    "for i, embedding in enumerate(embeddings):\n",
+    "    print(f\"Embedding for document {i+1}: {embedding}\")\n",
+    "\n"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Generate and print an embedding for a single piece of text. You can also generate an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "query = \"Quick foxes and lazy dogs.\"\n",
+    "query_embedding = embedder.embed_query(query)\n",
+    "print(f\"Embedding for query: {query_embedding}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "language_info": {
+   "name": "python"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/langchain/embeddings/__init__.py
+++ b/langchain/embeddings/__init__.py
@@ -33,6 +33,7 @@ from langchain.embeddings.self_hosted_hugging_face import (
    SelfHostedHuggingFaceInstructEmbeddings,
 )
 from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
+from langchain.embeddings.spacy_embeddings import SpacyEmbeddings
 from langchain.embeddings.tensorflow_hub import TensorflowHubEmbeddings
 from langchain.embeddings.vertexai import VertexAIEmbeddings

@@ -66,6 +67,7 @@ __all__ = [
    "DashScopeEmbeddings",
    "EmbaasEmbeddings",
    "OctoAIEmbeddings",
+    "SpacyEmbeddings",
 ]


--- a/langchain/embeddings/spacy_embeddings.py
+++ b/langchain/embeddings/spacy_embeddings.py
@ -0,0 +1,114 @@
+import importlib.util
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Extra, root_validator
+
+from langchain.embeddings.base import Embeddings
+
+
+class SpacyEmbeddings(BaseModel, Embeddings):
+    """
+    SpacyEmbeddings is a class for generating embeddings using the Spacy library.
+    It only supports the 'en_core_web_sm' model.
+
+    Attributes:
+        nlp (Any): The Spacy model loaded into memory.
+
+    Methods:
+        embed_documents(texts: List[str]) -> List[List[float]]:
+            Generates embeddings for a list of documents.
+        embed_query(text: str) -> List[float]:
+            Generates an embedding for a single piece of text.
+    """
+
+    nlp: Any  # The Spacy model loaded into memory
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid  # Forbid extra attributes during model initialization
+
+    @root_validator(pre=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """
+        Validates that the Spacy package and the 'en_core_web_sm' model are installed.
+
+        Args:
+            values (Dict): The values provided to the class constructor.
+
+        Returns:
+            The validated values.
+
+        Raises:
+            ValueError: If the Spacy package or the 'en_core_web_sm'
+            model are not installed.
+        """
+        # Check if the Spacy package is installed
+        if importlib.util.find_spec("spacy") is None:
+            raise ValueError(
+                "Spacy package not found. "
+                "Please install it with `pip install spacy`."
+            )
+        try:
+            # Try to load the 'en_core_web_sm' Spacy model
+            import spacy
+
+            values["nlp"] = spacy.load("en_core_web_sm")
+        except OSError:
+            # If the model is not found, raise a ValueError
+            raise ValueError(
+                "Spacy model 'en_core_web_sm' not found. "
+                "Please install it with"
+                " `python -m spacy download en_core_web_sm`."
+            )
+        return values  # Return the validated values
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generates embeddings for a list of documents.
+
+        Args:
+            texts (List[str]): The documents to generate embeddings for.
+
+        Returns:
+            A list of embeddings, one for each document.
+        """
+        return [self.nlp(text).vector.tolist() for text in texts]
+
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Generates an embedding for a single piece of text.
+
+        Args:
+            text (str): The text to generate an embedding for.
+
+        Returns:
+            The embedding for the text.
+        """
+        return self.nlp(text).vector.tolist()
+
+    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Asynchronously generates embeddings for a list of documents.
+        This method is not implemented and raises a NotImplementedError.
+
+        Args:
+            texts (List[str]): The documents to generate embeddings for.
+
+        Raises:
+            NotImplementedError: This method is not implemented.
+        """
+        raise NotImplementedError("Asynchronous embedding generation is not supported.")
+
+    async def aembed_query(self, text: str) -> List[float]:
+        """
+        Asynchronously generates an embedding for a single piece of text.
+        This method is not implemented and raises a NotImplementedError.
+
+        Args:
+            text (str): The text to generate an embedding for.
+
+        Raises:
+            NotImplementedError: This method is not implemented.
+        """
+        raise NotImplementedError("Asynchronous embedding generation is not supported.")