diff --git a/docs/ecosystem/modelscope.md b/docs/ecosystem/modelscope.md new file mode 100644 index 00000000..7e6ad73a --- /dev/null +++ b/docs/ecosystem/modelscope.md @@ -0,0 +1,20 @@ +# ModelScope + +This page covers how to use the ModelScope ecosystem within LangChain. +It is broken into two parts: installation and setup, and then references to specific ModelScope wrappers. + +## Installation and Setup + +* Install the Python SDK with `pip install modelscope` + +## Wrappers + +### Embeddings + +There is a ModelScope embeddings wrapper, which you can access with: + +```python +from langchain.embeddings import ModelScopeEmbeddings +``` + +For a more detailed walkthrough of this, see [this notebook](../modules/models/text_embedding/examples/modelscope_hub.ipynb) diff --git a/docs/modules/models/text_embedding/examples/modelscope_hub.ipynb b/docs/modules/models/text_embedding/examples/modelscope_hub.ipynb new file mode 100644 index 00000000..765d4676 --- /dev/null +++ b/docs/modules/models/text_embedding/examples/modelscope_hub.ipynb @@ -0,0 +1,82 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ModelScope\n", + "\n", + "Let's load the ModelScope Embedding class." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import ModelScopeEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_id = \"damo/nlp_corom_sentence-embedding_english-base\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = ModelScopeEmbeddings(model_id=model_id)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"This is a test document.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "doc_results = embeddings.embed_documents([\"foo\"])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "chatgpt", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.9.15" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py index 5c6ae21f..2ae59266 100644 --- a/langchain/embeddings/__init__.py +++ b/langchain/embeddings/__init__.py @@ -17,6 +17,7 @@ from langchain.embeddings.huggingface import ( from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings from langchain.embeddings.jina import JinaEmbeddings from langchain.embeddings.llamacpp import LlamaCppEmbeddings +from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings from langchain.embeddings.mosaicml import MosaicMLInstructorEmbeddings from langchain.embeddings.openai import OpenAIEmbeddings from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings @@ -38,6 +39,7 @@ 
__all__ = [
     "JinaEmbeddings",
     "LlamaCppEmbeddings",
     "HuggingFaceHubEmbeddings",
+    "ModelScopeEmbeddings",
     "TensorflowHubEmbeddings",
     "SagemakerEndpointEmbeddings",
     "HuggingFaceInstructEmbeddings",
diff --git a/langchain/embeddings/modelscope_hub.py b/langchain/embeddings/modelscope_hub.py
new file mode 100644
index 00000000..23c0bfc0
--- /dev/null
+++ b/langchain/embeddings/modelscope_hub.py
@@ -0,0 +1,83 @@
+"""Wrapper around ModelScopeHub embedding models."""
+from typing import Any, List
+
+from pydantic import BaseModel, Extra
+
+from langchain.embeddings.base import Embeddings
+
+
+class ModelScopeEmbeddings(BaseModel, Embeddings):
+    """Wrapper around modelscope_hub embedding models.
+
+    To use, you should have the ``modelscope`` python package installed.
+
+    Example:
+        .. code-block:: python
+
+            from langchain.embeddings import ModelScopeEmbeddings
+            model_id = "damo/nlp_corom_sentence-embedding_english-base"
+            embed = ModelScopeEmbeddings(model_id=model_id)
+    """
+
+    embed: Any
+    """Sentence-embedding pipeline; built from ``model_id`` in ``__init__``."""
+    model_id: str = "damo/nlp_corom_sentence-embedding_english-base"
+    """Model name to use."""
+
+    def __init__(self, **kwargs: Any):
+        """Validate the fields, then build the modelscope embedding pipeline.
+
+        Raises:
+            ImportError: If importing ``modelscope`` or building the pipeline
+                raises ``ImportError``.
+        """
+        super().__init__(**kwargs)
+        try:
+            from modelscope.pipelines import pipeline
+            from modelscope.utils.constant import Tasks
+
+            self.embed = pipeline(Tasks.sentence_embedding, model=self.model_id)
+
+        except ImportError as e:
+            raise ImportError(
+                "Could not import some python packages. "
+                "Please install it with `pip install modelscope`."
+            ) from e
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Compute doc embeddings using a modelscope embedding model.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text (empty if ``texts`` is empty).
+        """
+        if not texts:
+            # Nothing to embed, so skip the model call entirely.
+            return []
+        # Newlines are replaced with spaces before the text reaches the model.
+        texts = [text.replace("\n", " ") for text in texts]
+        inputs = {"source_sentence": texts}
+        embeddings = self.embed(input=inputs)["text_embedding"]
+        return embeddings.tolist()
+
+    def embed_query(self, text: str) -> List[float]:
+        """Compute query embeddings using a modelscope embedding model.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        text = text.replace("\n", " ")
+        inputs = {"source_sentence": [text]}
+        # Only one sentence was sent, so take the first entry of the result.
+        embedding = self.embed(input=inputs)["text_embedding"][0]
+        return embedding.tolist()
diff --git a/tests/integration_tests/embeddings/test_modelscope_hub.py b/tests/integration_tests/embeddings/test_modelscope_hub.py
new file mode 100644
index 00000000..103568af
--- /dev/null
+++ b/tests/integration_tests/embeddings/test_modelscope_hub.py
@@ -0,0 +1,19 @@
+"""Test modelscope embeddings."""
+from langchain.embeddings.modelscope_hub import ModelScopeEmbeddings
+
+
+def test_modelscope_embedding_documents() -> None:
+    """Test modelscope embeddings for documents."""
+    documents = ["foo bar"]
+    embedding = ModelScopeEmbeddings()
+    output = embedding.embed_documents(documents)
+    assert len(output) == len(documents)
+    assert len(output[0]) == 512
+
+
+def test_modelscope_embedding_query() -> None:
+    """Test modelscope embeddings for query."""
+    document = "foo bar"
+    embedding = ModelScopeEmbeddings()
+    output = embedding.embed_query(document)
+    assert len(output) == 512