diff --git a/docs/docs/integrations/providers/johnsnowlabs.mdx b/docs/docs/integrations/providers/johnsnowlabs.mdx new file mode 100644 index 0000000000..39f3ea494c --- /dev/null +++ b/docs/docs/integrations/providers/johnsnowlabs.mdx @@ -0,0 +1,117 @@ +# Johnsnowlabs + +Gain access to the [johnsnowlabs](https://www.johnsnowlabs.com/) ecosystem of enterprise NLP libraries +with over 21,000 enterprise NLP models in over 200 languages with the open source `johnsnowlabs` library. +For all 24,000+ models, see the [John Snow Labs Models Hub](https://nlp.johnsnowlabs.com/models). + +## Installation and Setup + + +```bash +pip install johnsnowlabs +``` + +To [install enterprise features](https://nlp.johnsnowlabs.com/docs/en/jsl/install_licensed_quick), run: +```python +from johnsnowlabs import nlp  # details: https://nlp.johnsnowlabs.com/docs/en/jsl/install_licensed_quick +nlp.install() +``` + + +You can embed your queries and documents with `gpu`, `cpu`, `apple_silicon`, or `aarch` optimized binaries. +By default, CPU binaries are used. +Once a session is started, you must restart your notebook to switch between GPU and CPU, or the change will not take effect. 
+ +## Embed Query with CPU: +```python +document = "foo bar" +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert') +output = embedding.embed_query(document) +``` + + +## Embed Query with GPU: + + +```python +document = "foo bar" +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert','gpu') +output = embedding.embed_query(document) +``` + + + + +## Embed Query with Apple Silicon (M1, M2, etc.): + +```python +document = "foo bar" +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert','apple_silicon') +output = embedding.embed_query(document) +``` + + + +## Embed Query with AARCH: + +```python +document = "foo bar" +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert','aarch') +output = embedding.embed_query(document) +``` + + + + + + +## Embed Document with CPU: +```python +documents = ["foo bar", 'bar foo'] +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert','cpu') +output = embedding.embed_documents(documents) +``` + + + +## Embed Document with GPU: + +```python +documents = ["foo bar", 'bar foo'] +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert','gpu') +output = embedding.embed_documents(documents) +``` + + + + + +## Embed Document with Apple Silicon (M1, M2, etc.): + + + +```python +documents = ["foo bar", 'bar foo'] +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert','apple_silicon') +output = embedding.embed_documents(documents) +``` + + + +## Embed Document with AARCH: + + + +```python +documents = ["foo bar", 'bar foo'] +embedding = JohnSnowLabsEmbeddings('embed_sentence.bert','aarch') +output = embedding.embed_documents(documents) +``` + + + + +Models are loaded with [nlp.load](https://nlp.johnsnowlabs.com/docs/en/jsl/load_api) and a Spark session is started with [nlp.start()](https://nlp.johnsnowlabs.com/docs/en/jsl/start-a-sparksession) under the hood. 
+ + + diff --git a/docs/docs/integrations/text_embedding/johnsnowlabs_embedding.ipynb b/docs/docs/integrations/text_embedding/johnsnowlabs_embedding.ipynb new file mode 100644 index 0000000000..0fdc7ac3b0 --- /dev/null +++ b/docs/docs/integrations/text_embedding/johnsnowlabs_embedding.ipynb @@ -0,0 +1,184 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# Johnsnowlabs Embedding\n", + "\n", + "### Loading the Johnsnowlabs embedding class to generate and query embeddings\n", + "\n", + "Models are loaded with [nlp.load](https://nlp.johnsnowlabs.com/docs/en/jsl/load_api) and a Spark session is started with [nlp.start()](https://nlp.johnsnowlabs.com/docs/en/jsl/start-a-sparksession) under the hood.\n", + "For all 24,000+ models, see the [John Snow Labs Models Hub](https://nlp.johnsnowlabs.com/models)\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "! pip install johnsnowlabs\n" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# If you have an enterprise license, you can run this to install enterprise features\n", + "# from johnsnowlabs import nlp\n", + "# nlp.install()" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "source": [ + "#### Import the necessary classes" + ], + "metadata": { + "collapsed": false + }, + "execution_count": 1, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing installation: langchain 0.0.189\n", + "Uninstalling langchain-0.0.189:\n", + " Successfully uninstalled langchain-0.0.189\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "from langchain.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings" + ], + "metadata": { + "collapsed": false + } + }, + { + 
"cell_type": "markdown", + "source": [ + "#### Initialize Johnsnowlabs Embeddings and Spark Session" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "embedder = JohnSnowLabsEmbeddings('en.embed_sentence.biobert.clinical_base_cased')" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Define some example texts . These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "texts = [\"Cancer is caused by smoking\", \"Antibiotics aren't painkiller\"]" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Generate and print embeddings for the texts . The JohnSnowLabsEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "embeddings = embedder.embed_documents(texts)\n", + "for i, embedding in enumerate(embeddings):\n", + " print(f\"Embedding for document {i+1}: {embedding}\")" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Generate and print an embedding for a single piece of text. You can also generate an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query." 
+ ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "query = \"Cancer is caused by smoking\"\n", + "query_embedding = embedder.embed_query(query)\n", + "print(f\"Embedding for query: {query_embedding}\")" + ], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/libs/langchain/langchain/embeddings/__init__.py b/libs/langchain/langchain/embeddings/__init__.py index 47251dbe7e..7c9acdb5e4 100644 --- a/libs/langchain/langchain/embeddings/__init__.py +++ b/libs/langchain/langchain/embeddings/__init__.py @@ -43,6 +43,7 @@ from langchain.embeddings.huggingface import ( from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings from langchain.embeddings.javelin_ai_gateway import JavelinAIGatewayEmbeddings from langchain.embeddings.jina import JinaEmbeddings +from langchain.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings from langchain.embeddings.llamacpp import LlamaCppEmbeddings from langchain.embeddings.localai import LocalAIEmbeddings from langchain.embeddings.minimax import MiniMaxEmbeddings @@ -113,6 +114,7 @@ __all__ = [ "JavelinAIGatewayEmbeddings", "OllamaEmbeddings", "QianfanEmbeddingsEndpoint", + "JohnSnowLabsEmbeddings", ] diff --git a/libs/langchain/langchain/embeddings/johnsnowlabs.py b/libs/langchain/langchain/embeddings/johnsnowlabs.py new file mode 100644 index 0000000000..ead57e5228 --- /dev/null +++ b/libs/langchain/langchain/embeddings/johnsnowlabs.py @@ -0,0 +1,92 @@ +import os +import sys +from typing import Any, List + +from 
from langchain.embeddings.base import Embeddings
from langchain.pydantic_v1 import BaseModel, Extra


class JohnSnowLabsEmbeddings(BaseModel, Embeddings):
    """JohnSnowLabs embedding models.

    To use, you should have the ``johnsnowlabs`` python package installed.

    Constructing an instance starts a Spark session via ``nlp.start`` and
    loads the requested model, so construction can be slow.

    Example:
        .. code-block:: python

            from langchain.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings

            embedding = JohnSnowLabsEmbeddings(model='embed_sentence.bert')
            output = embedding.embed_query("foo bar")
    """

    # Holds the model reference passed by the caller until ``__init__``
    # replaces it with the loaded pipeline object.
    model: Any = "embed_sentence.bert"

    def __init__(
        self,
        model: Any = "embed_sentence.bert",
        hardware_target: str = "cpu",
        **kwargs: Any
    ):
        """Initialize the johnsnowlabs model.

        Args:
            model: A model reference string handed to ``nlp.load``, an
                already built ``NLUPipeline`` (used as is), or any other
                object, which is converted with ``nlp.to_nlu_pipe``.
            hardware_target: Forwarded to ``nlp.start``. The accompanying
                provider docs list ``cpu``, ``gpu``, ``apple_silicon`` and
                ``aarch``.
            **kwargs: Forwarded to the pydantic ``BaseModel`` constructor.

        Raises:
            ImportError: If the ``johnsnowlabs`` package cannot be imported.
            Exception: If starting the Spark session or loading the model
                fails; the underlying error is chained as ``__cause__``.
        """
        super().__init__(**kwargs)

        # 1) Check imports. Imported lazily so that importing this module
        #    does not require the optional dependency.
        try:
            from johnsnowlabs import nlp
            from nlu.pipe.pipeline import NLUPipeline
        except ImportError as exc:
            raise ImportError(
                "Could not import johnsnowlabs python package. "
                "Please install it with `pip install johnsnowlabs`."
            ) from exc

        # 2) Start a Spark Session. PySpark is pointed at the interpreter
        #    running this process for both the driver and the workers.
        try:
            os.environ["PYSPARK_PYTHON"] = sys.executable
            os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable
            nlp.start(hardware_target=hardware_target)
        except Exception as exc:
            raise Exception("Failure starting Spark Session") from exc

        # 3) Load the model, normalising every accepted input to a pipeline.
        try:
            if isinstance(model, str):
                self.model = nlp.load(model)
            elif isinstance(model, NLUPipeline):
                self.model = model
            else:
                self.model = nlp.to_nlu_pipe(model)
        except Exception as exc:
            raise Exception("Failure loading model") from exc

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Compute doc embeddings using a JohnSnowLabs transformer model.

        Args:
            texts: The list of texts to embed.

        Returns:
            List of embeddings, one for each text. An empty input yields an
            empty list without invoking the model.

        Raises:
            ValueError: If the prediction output has no column whose name
                contains ``"embedding"``.
        """
        if not texts:
            return []

        df = self.model.predict(texts, output_level="document")

        # The embedding column name depends on the loaded model, so it is
        # located by substring. When several columns match, the last one is
        # used, which is what the previous loop-based implementation did.
        matching_columns = [c for c in df.columns if "embedding" in c]
        if not matching_columns:
            raise ValueError(
                "Model output contains no embedding column; "
                f"available columns: {list(df.columns)}"
            )
        emb_col = matching_columns[-1]

        return [vec.tolist() for vec in df[emb_col].tolist()]

    def embed_query(self, text: str) -> List[float]:
        """Compute query embeddings using a JohnSnowLabs transformer model.

        Args:
            text: The text to embed.

        Returns:
            Embeddings for the text.
        """
        # A query is embedded exactly like a one-element document batch.
        return self.embed_documents([text])[0]


# ---------------------------------------------------------------------------
# libs/langchain/tests/integration_tests/embeddings/test_johnsnowlabs.py
#
# Test johnsnowlabs embeddings.
#
# These integration tests live in their own module in the change set, where
# the class is brought in with
# ``from langchain.embeddings.johnsnowlabs import JohnSnowLabsEmbeddings``.
# They build the embedder with its default arguments and expect
# 128-dimensional vectors.
# ---------------------------------------------------------------------------


def test_johnsnowlabs_embed_document() -> None:
    """Embed two documents and check the batch size and vector length."""
    documents = ["foo bar", "bar foo"]
    embedding = JohnSnowLabsEmbeddings()
    output = embedding.embed_documents(documents)
    assert len(output) == 2
    assert len(output[0]) == 128


def test_johnsnowlabs_embed_query() -> None:
    """Embed a single query and check the vector length."""
    document = "foo bar"
    embedding = JohnSnowLabsEmbeddings()
    output = embedding.embed_query(document)
    assert len(output) == 128