langchain/libs/community/langchain_community/embeddings/johnsnowlabs.py

import os
import sys
from typing import Any, List

from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Extra


class JohnSnowLabsEmbeddings(BaseModel, Embeddings):
    """JohnSnowLabs embedding models

    Loads a John Snow Labs pipeline at construction time and exposes it
    through the ``Embeddings`` interface (``embed_documents`` /
    ``embed_query``).

    To use, you should have the ``johnsnowlabs`` python package installed.
    Example:
        .. code-block:: python

            from langchain_community.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings

            embedding = JohnSnowLabsEmbeddings(model='embed_sentence.bert')
            output = embedding.embed_query("foo bar")
    """  # noqa: E501

    # Declared default is a model reference string; ``__init__`` always
    # overwrites this field with the loaded pipeline object.
    model: Any = "embed_sentence.bert"

    def __init__(
        self,
        model: Any = "embed_sentence.bert",
        hardware_target: str = "cpu",
        **kwargs: Any,
    ):
        """Initialize the johnsnowlabs model.

        Args:
            model: Either a string handed to ``nlp.load``, an ``NLUPipeline``
                instance that is used as-is, or any other object, which is
                converted with ``nlp.to_nlu_pipe``.
            hardware_target: Forwarded to ``nlp.start`` as ``hardware_target``.
            **kwargs: Forwarded to the pydantic ``BaseModel`` constructor.

        Raises:
            ImportError: If ``johnsnowlabs`` (or ``nlu``) cannot be imported.
            Exception: If starting the session or loading the model fails;
                the underlying error is attached as ``__cause__``.
        """
        super().__init__(**kwargs)
        # 1) Check imports
        try:
            from johnsnowlabs import nlp
            from nlu.pipe.pipeline import NLUPipeline
        except ImportError as exc:
            raise ImportError(
                "Could not import johnsnowlabs python package. "
                "Please install it with `pip install johnsnowlabs`."
            ) from exc

        # 2) Start a Spark Session
        try:
            # Point both PySpark interpreter variables at the interpreter
            # running this code before the session is started.
            os.environ["PYSPARK_PYTHON"] = sys.executable
            os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
            nlp.start(hardware_target=hardware_target)
        except Exception as exc:
            raise Exception("Failure starting Spark Session") from exc

        # 3) Load the model
        try:
            if isinstance(model, str):
                self.model = nlp.load(model)
            elif isinstance(model, NLUPipeline):
                self.model = model
            else:
                self.model = nlp.to_nlu_pipe(model)
        except Exception as exc:
            raise Exception("Failure loading model") from exc

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a JohnSnowLabs transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list without invoking the pipeline.

        Raises:
            KeyError: If the prediction output has no column whose name
                contains ``"embedding"``.
        """
        if not texts:
            # Nothing to embed; avoid running the pipeline on an empty batch.
            return []

        df = self.model.predict(texts, output_level="document")

        embedding_columns = [col for col in df.columns if "embedding" in col]
        if not embedding_columns:
            # Previously this surfaced as an opaque ``KeyError: None`` from the
            # column lookup; keep the exception type but explain the cause.
            raise KeyError(
                "No column containing 'embedding' found in the prediction output; "
                f"available columns: {list(df.columns)}"
            )
        # When several columns match, the last one wins (unchanged behaviour).
        emb_col = embedding_columns[-1]

        # NOTE(review): each cell is presumably a numpy array (it must expose
        # ``.tolist()``) — confirm against the pipeline's output format.
        return [vec.tolist() for vec in df[emb_col].tolist()]

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a JohnSnowLabs transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        # A query is embedded as a one-element document batch.
        return self.embed_documents([text])[0]
community[major], core[patch], langchain[patch], experimental[patch]: Create langchain-community (#14463) Moved the following modules to new package langchain-community in a backwards compatible fashion: ``` mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community mv langchain/langchain/adapters community/langchain_community mv langchain/langchain/callbacks community/langchain_community/callbacks mv langchain/langchain/chat_loaders community/langchain_community mv langchain/langchain/chat_models community/langchain_community mv langchain/langchain/document_loaders community/langchain_community mv langchain/langchain/docstore community/langchain_community mv langchain/langchain/document_transformers community/langchain_community mv langchain/langchain/embeddings community/langchain_community mv langchain/langchain/graphs community/langchain_community mv 
langchain/langchain/llms community/langchain_community mv langchain/langchain/memory/chat_message_histories community/langchain_community mv langchain/langchain/retrievers community/langchain_community mv langchain/langchain/storage community/langchain_community mv langchain/langchain/tools community/langchain_community mv langchain/langchain/utilities community/langchain_community mv langchain/langchain/vectorstores community/langchain_community mv langchain/langchain/agents/agent_toolkits community/langchain_community mv langchain/langchain/cache.py community/langchain_community ``` Moved the following to core ``` mv langchain/langchain/utils/json_schema.py core/langchain_core/utils mv langchain/langchain/utils/html.py core/langchain_core/utils mv langchain/langchain/utils/strings.py core/langchain_core/utils cat langchain/langchain/utils/env.py >> core/langchain_core/utils/env.py rm langchain/langchain/utils/env.py ``` See .scripts/community_split/script_integrations.sh for all changes 2023-12-11 21:53:30 +00:00			`import os`
			`import sys`
			`from typing import Any, List`

			`from langchain_core.embeddings import Embeddings`
			`from langchain_core.pydantic_v1 import BaseModel, Extra`


			`class JohnSnowLabsEmbeddings(BaseModel, Embeddings):`
			`"""JohnSnowLabs embedding models`

			To use, you should have the ``johnsnowlabs`` python package installed.
			`Example:`
			`.. code-block:: python`

			`from langchain_community.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings`

			`embedding = JohnSnowLabsEmbeddings(model='embed_sentence.bert')`
			`output = embedding.embed_query("foo bar")`
			`""" # noqa: E501`

			`model: Any = "embed_sentence.bert"`

			`def __init__(`
			`self,`
			`model: Any = "embed_sentence.bert",`
			`hardware_target: str = "cpu",`
			`**kwargs: Any,`
			`):`
			`"""Initialize the johnsnowlabs model."""`
			`super().__init__(**kwargs)`
			`# 1) Check imports`
			`try:`
			`from johnsnowlabs import nlp`
			`from nlu.pipe.pipeline import NLUPipeline`
			`except ImportError as exc:`
			`raise ImportError(`
			`"Could not import johnsnowlabs python package. "`
			"Please install it with `pip install johnsnowlabs`."
			`) from exc`

			`# 2) Start a Spark Session`
			`try:`
			`os.environ["PYSPARK_PYTHON"] = sys.executable`
			`os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable`
			`nlp.start(hardware_target=hardware_target)`
			`except Exception as exc:`
			`raise Exception("Failure starting Spark Session") from exc`

			`# 3) Load the model`
			`try:`
			`if isinstance(model, str):`
			`self.model = nlp.load(model)`
			`elif isinstance(model, NLUPipeline):`
			`self.model = model`
			`else:`
			`self.model = nlp.to_nlu_pipe(model)`
			`except Exception as exc:`
			`raise Exception("Failure loading model") from exc`

			`class Config:`
			`"""Configuration for this pydantic object."""`

			`extra = Extra.forbid`

			`def embed_documents(self, texts: List[str]) -> List[List[float]]:`
			`"""Compute doc embeddings using a JohnSnowLabs transformer model.`

			`Args:`
			`texts: The list of texts to embed.`

			`Returns:`
			`List of embeddings, one for each text.`
			`"""`

			`df = self.model.predict(texts, output_level="document")`
			`emb_col = None`
			`for c in df.columns:`
			`if "embedding" in c:`
			`emb_col = c`
			`return [vec.tolist() for vec in df[emb_col].tolist()]`

			`def embed_query(self, text: str) -> List[float]:`
			`"""Compute query embeddings using a JohnSnowLabs transformer model.`

			`Args:`
			`text: The text to embed.`

			`Returns:`
			`Embeddings for the text.`
			`"""`
			`return self.embed_documents([text])[0]`