From a1ce7ab6721615a90082d92008ed03590b93ffcf Mon Sep 17 00:00:00 2001 From: Marina Pliusnina Date: Tue, 30 Jan 2024 05:30:34 +0100 Subject: [PATCH] adding parameter for changing the language in SpacyEmbeddings (#15743) Description: Added a `model_name` parameter that makes it possible to change the language model used by SpacyEmbeddings. The default value is still the same, "en_core_web_sm", so it should not affect code that previously did not specify this parameter, but the model is no longer hard-coded and is easy to change if you want to use other languages or models. Issue: At the Barcelona Supercomputing Center, in the Aina project (https://github.com/projecte-aina), a project for Catalan language models and resources, we would like to use LangChain for one of our current projects. We would like to point out that LangChain, while being a very powerful and useful open-source tool, is largely focused on the English language. We would like to contribute to making it more adaptable for use with other languages. Dependencies: This change requires the spaCy library and a language model, specified in the `model_name` parameter. 
Tag maintainer: @dev2049 Twitter handle: @projecte_aina --------- Co-authored-by: Marina Pliusnina Co-authored-by: Harrison Chase --- .../text_embedding/spacy_embedding.ipynb | 2 +- .../embeddings/spacy_embeddings.py | 36 +++++++++++-------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/docs/docs/integrations/text_embedding/spacy_embedding.ipynb b/docs/docs/integrations/text_embedding/spacy_embedding.ipynb index a017d8f283..dfc8afc44a 100644 --- a/docs/docs/integrations/text_embedding/spacy_embedding.ipynb +++ b/docs/docs/integrations/text_embedding/spacy_embedding.ipynb @@ -52,7 +52,7 @@ "metadata": {}, "outputs": [], "source": [ - "embedder = SpacyEmbeddings()" + "embedder = SpacyEmbeddings(model_name=\"en_core_web_sm\")" ] }, { diff --git a/libs/community/langchain_community/embeddings/spacy_embeddings.py b/libs/community/langchain_community/embeddings/spacy_embeddings.py index eb581d7384..645d5afc96 100644 --- a/libs/community/langchain_community/embeddings/spacy_embeddings.py +++ b/libs/community/langchain_community/embeddings/spacy_embeddings.py @@ -1,17 +1,16 @@ import importlib.util -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional from langchain_core.embeddings import Embeddings from langchain_core.pydantic_v1 import BaseModel, Extra, root_validator class SpacyEmbeddings(BaseModel, Embeddings): - """Embeddings by SpaCy models. - - It only supports the 'en_core_web_sm' model. + """Embeddings by spaCy models. Attributes: - nlp (Any): The Spacy model loaded into memory. + model_name (str): Name of a spaCy model. + nlp (Any): The spaCy model loaded into memory. Methods: embed_documents(texts: List[str]) -> List[List[float]]: @@ -20,7 +19,8 @@ class SpacyEmbeddings(BaseModel, Embeddings): Generates an embedding for a single piece of text. 
""" - nlp: Any # The Spacy model loaded into memory + model_name: str = "en_core_web_sm" + nlp: Optional[Any] = None class Config: """Configuration for this pydantic object.""" @@ -30,7 +30,7 @@ class SpacyEmbeddings(BaseModel, Embeddings): @root_validator(pre=True) def validate_environment(cls, values: Dict) -> Dict: """ - Validates that the Spacy package and the 'en_core_web_sm' model are installed. + Validates that the spaCy package and the model are installed. Args: values (Dict): The values provided to the class constructor. @@ -39,26 +39,32 @@ class SpacyEmbeddings(BaseModel, Embeddings): The validated values. Raises: - ValueError: If the Spacy package or the 'en_core_web_sm' + ValueError: If the spaCy package or the model are not installed. """ - # Check if the Spacy package is installed + if values.get("model_name") is None: + values["model_name"] = "en_core_web_sm" + + model_name = values.get("model_name") + + # Check if the spaCy package is installed if importlib.util.find_spec("spacy") is None: raise ValueError( - "Spacy package not found. " + "SpaCy package not found. " "Please install it with `pip install spacy`." ) try: - # Try to load the 'en_core_web_sm' Spacy model + # Try to load the spaCy model import spacy - values["nlp"] = spacy.load("en_core_web_sm") + values["nlp"] = spacy.load(model_name) except OSError: # If the model is not found, raise a ValueError raise ValueError( - "Spacy model 'en_core_web_sm' not found. " - "Please install it with" - " `python -m spacy download en_core_web_sm`." + f"SpaCy model '{model_name}' not found. " + f"Please install it with" + f" `python -m spacy download {model_name}`" + "or provide a valid spaCy model name." ) return values # Return the validated values