From e2d61ab85a1c6d4cc80cb1d605405653ccbf86cd Mon Sep 17 00:00:00 2001 From: rjarun8 <50106442+rjarun8@users.noreply.github.com> Date: Mon, 3 Jul 2023 21:08:31 +0530 Subject: [PATCH] Add SpacyEmbeddings class (#6967) - Description: Added a new SpacyEmbeddings class for generating embeddings using the Spacy library. - Issue: Sentencebert/Bert/Spacy/Doc2vec embedding support #6952 - Dependencies: This change requires the Spacy library and the 'en_core_web_sm' Spacy model. - Tag maintainer: @dev2049 - Twitter handle: N/A This change includes a new SpacyEmbeddings class, but does not include a test or an example notebook. --------- Co-authored-by: Bagatur --- .../integrations/spacy_embedding.ipynb | 126 ++++++++++++++++++ langchain/embeddings/__init__.py | 2 + langchain/embeddings/spacy_embeddings.py | 114 ++++++++++++++++ 3 files changed, 242 insertions(+) create mode 100644 docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb create mode 100644 langchain/embeddings/spacy_embeddings.py diff --git a/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb b/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb new file mode 100644 index 0000000000..0e83aaea70 --- /dev/null +++ b/docs/extras/modules/data_connection/text_embedding/integrations/spacy_embedding.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Spacy Embedding\n", + "\n", + "### Loading the Spacy embedding class to generate and query embeddings" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import the necessary classes" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from langchain.embeddings.spacy_embeddings import SpacyEmbeddings\n", + "\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", 
+ "metadata": {}, + "source": [ + "#### Initialize SpacyEmbeddings. This will load the Spacy model into memory." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "embedder = SpacyEmbeddings()\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Define some example texts. These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "\n", + "texts = [\n", + " \"The quick brown fox jumps over the lazy dog.\",\n", + " \"Pack my box with five dozen liquor jugs.\",\n", + " \"How vexingly quick daft zebras jump!\",\n", + " \"Bright vixens jump; dozy fowl quack.\"\n", + "]\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Generate and print embeddings for the texts. The SpacyEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "embeddings = embedder.embed_documents(texts)\n", + "for i, embedding in enumerate(embeddings):\n", + " print(f\"Embedding for document {i+1}: {embedding}\")\n", + "\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Generate and print an embedding for a single piece of text. You can also generate an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "query = \"Quick foxes and lazy dogs.\"\n", + "query_embedding = embedder.embed_query(query)\n", + "print(f\"Embedding for query: {query_embedding}\")" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py index a492ab3b0c..7edd63e838 100644 --- a/langchain/embeddings/__init__.py +++ b/langchain/embeddings/__init__.py @@ -33,6 +33,7 @@ from langchain.embeddings.self_hosted_hugging_face import ( SelfHostedHuggingFaceInstructEmbeddings, ) from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings +from langchain.embeddings.spacy_embeddings import SpacyEmbeddings from langchain.embeddings.tensorflow_hub import TensorflowHubEmbeddings from langchain.embeddings.vertexai import VertexAIEmbeddings @@ -66,6 +67,7 @@ __all__ = [ "DashScopeEmbeddings", "EmbaasEmbeddings", "OctoAIEmbeddings", + "SpacyEmbeddings", ]
diff --git a/langchain/embeddings/spacy_embeddings.py b/langchain/embeddings/spacy_embeddings.py
new file mode 100644
index 0000000000..66f4baa2f6
--- /dev/null
+++ b/langchain/embeddings/spacy_embeddings.py
@@ -0,0 +1,124 @@
+import importlib.util
+from typing import Any, Dict, List
+
+from pydantic import BaseModel, Extra, root_validator
+
+from langchain.embeddings.base import Embeddings
+
+
+class SpacyEmbeddings(BaseModel, Embeddings):
+    """
+    SpacyEmbeddings is a class for generating embeddings using the Spacy library.
+    Each text is run through the loaded pipeline and its `Doc.vector` is returned.
+
+    Attributes:
+        model_name (str): Name of the Spacy model to load.
+            Defaults to 'en_core_web_sm'.
+        nlp (Any): The Spacy model loaded into memory. It is populated by
+            `validate_environment`; a value passed to the constructor is replaced.
+
+    Methods:
+        embed_documents(texts: List[str]) -> List[List[float]]:
+            Generates embeddings for a list of documents.
+        embed_query(text: str) -> List[float]:
+            Generates an embedding for a single piece of text.
+    """
+
+    model_name: str = "en_core_web_sm"  # Name of the Spacy model to load
+    nlp: Any  # The Spacy model loaded into memory
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid  # Forbid extra attributes during model initialization
+
+    @root_validator(pre=True)
+    def validate_environment(cls, values: Dict) -> Dict:
+        """
+        Validates that the Spacy package and the requested model are installed,
+        then loads the model into `values["nlp"]`.
+
+        Args:
+            values (Dict): The values provided to the class constructor.
+
+        Returns:
+            The validated values, with "model_name" and "nlp" filled in.
+
+        Raises:
+            ValueError: If the Spacy package or the requested model
+                are not installed.
+        """
+        # This validator runs before field defaults are applied (pre=True),
+        # so fall back to the default model name explicitly.
+        model_name = values.get("model_name") or "en_core_web_sm"
+        values["model_name"] = model_name
+
+        # Check if the Spacy package is installed
+        if importlib.util.find_spec("spacy") is None:
+            raise ValueError(
+                "Spacy package not found. "
+                "Please install it with `pip install spacy`."
+            )
+        import spacy
+
+        try:
+            # Try to load the requested Spacy model
+            values["nlp"] = spacy.load(model_name)
+        except OSError as e:
+            # If the model is not found, raise a ValueError that keeps the cause
+            raise ValueError(
+                f"Spacy model '{model_name}' not found. "
+                "Please install it with"
+                f" `python -m spacy download {model_name}`."
+            ) from e
+        return values  # Return the validated values
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Generates embeddings for a list of documents.
+
+        Args:
+            texts (List[str]): The documents to generate embeddings for.
+
+        Returns:
+            A list of embeddings, one for each document.
+        """
+        return [self.nlp(text).vector.tolist() for text in texts]
+
+    def embed_query(self, text: str) -> List[float]:
+        """
+        Generates an embedding for a single piece of text.
+
+        Args:
+            text (str): The text to generate an embedding for.
+
+        Returns:
+            The embedding for the text.
+        """
+        return self.nlp(text).vector.tolist()
+
+    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
+        """
+        Asynchronously generates embeddings for a list of documents.
+        This method is not implemented and raises a NotImplementedError.
+
+        Args:
+            texts (List[str]): The documents to generate embeddings for.
+
+        Raises:
+            NotImplementedError: This method is not implemented.
+        """
+        raise NotImplementedError("Asynchronous embedding generation is not supported.")
+
+    async def aembed_query(self, text: str) -> List[float]:
+        """
+        Asynchronously generates an embedding for a single piece of text.
+        This method is not implemented and raises a NotImplementedError.
+
+        Args:
+            text (str): The text to generate an embedding for.
+
+        Raises:
+            NotImplementedError: This method is not implemented.
+        """
+        raise NotImplementedError("Asynchronous embedding generation is not supported.")