FEAT: Integrate Xinference LLMs and Embeddings (#8171)
- [Xorbits
Inference (Xinference)](https://github.com/xorbitsai/inference) is a
powerful and versatile library designed to serve language, speech
recognition, and multimodal models. Xinference supports a variety of
GGML-compatible models including chatglm, whisper, and vicuna, and
utilizes heterogeneous hardware and a distributed architecture for
seamless cross-device and cross-server model deployment.
- This PR integrates Xinference models and Xinference embeddings into
LangChain.
- Dependencies: To install the dependencies for this integration, run
`pip install "xinference[all]"`
- Example Usage:
To start a local instance of Xinference, run `xinference`.
To deploy Xinference in a distributed cluster, first start an Xinference
supervisor using `xinference-supervisor`:
`xinference-supervisor -H "${supervisor_host}"`
Then, start the Xinference workers using `xinference-worker` on each
server you want to run them on.
`xinference-worker -e "http://${supervisor_host}:9997"`
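Once the supervisor and workers are up, a quick way to sanity-check the
deployment (purely illustrative, and assuming the installed xinference
version exposes `RESTfulClient.list_models`) is:
```python
from xinference.client import RESTfulClient

# Illustrative sanity check: replace "0.0.0.0" with your supervisor_host.
client = RESTfulClient("http://0.0.0.0:9997")
print(client.list_models())  # empty until a model has been launched
```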
To use Xinference with LangChain, you also need to launch a model. You
can use the command-line interface (CLI) to do so. For example:
`xinference launch -n vicuna-v1.3 -f ggmlv3 -q q4_0`. This launches a
model named vicuna-v1.3 with `model_format="ggmlv3"` and
`quantization="q4_0"`. A model UID is returned for you to use.
Now you can use Xinference with LangChain:
```python
from langchain.llms import Xinference

llm = Xinference(
    server_url="http://0.0.0.0:9997",  # suppose the supervisor_host is "0.0.0.0"
    model_uid={model_uid},  # model UID returned from launching a model
)

llm(
    prompt="Q: where can we visit in the capital of France? A:",
    generate_config={"max_tokens": 1024},
)
```
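Beyond a raw completion call, the wrapper should also slot into a regular
LangChain chain. The snippet below is only an illustrative sketch, not part
of this PR's tests; the prompt template is made up, and `model_uid` is a
placeholder for the UID returned by `xinference launch`:
```python
from langchain.chains import LLMChain
from langchain.llms import Xinference
from langchain.prompts import PromptTemplate

model_uid = "..."  # placeholder: UID returned by `xinference launch`
llm = Xinference(server_url="http://0.0.0.0:9997", model_uid=model_uid)

# Illustrative prompt template (not from the PR) reusing the question above.
prompt = PromptTemplate(
    input_variables=["country"],
    template="Q: where can we visit in the capital of {country}? A:",
)
chain = LLMChain(llm=llm, prompt=prompt)
print(chain.run(country="France"))
```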
You can also use the RESTful client to launch a model:
```python
from xinference.client import RESTfulClient
client = RESTfulClient("http://0.0.0.0:9997")
model_uid = client.launch_model(
    model_name="vicuna-v1.3", model_size_in_billions=7, quantization="q4_0"
)
```
The following code block demonstrates how to use Xinference embeddings
with LangChain:
```python
from langchain.embeddings import XinferenceEmbeddings

xinference = XinferenceEmbeddings(
    server_url="http://0.0.0.0:9997",
    model_uid=model_uid,
)
```
```python
query_result = xinference.embed_query("This is a test query")
```
```python
doc_result = xinference.embed_documents(["text A", "text B"])
```
Xinference is still under rapid development. Feel free to [join our
Slack
community](https://xorbitsio.slack.com/join/shared_invite/zt-1z3zsm9ep-87yI9YZ_B79HLB2ccTq4WA)
to get the latest updates!
- Request for review: @hwchase17, @baskaryan
- Twitter handle: https://twitter.com/Xorbitsio
---------
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-07-28 04:23:19 +00:00
"""Test Xinference embeddings."""
import time
from typing import AsyncGenerator, Tuple

import pytest_asyncio

from langchain_community.embeddings import XinferenceEmbeddings
@pytest_asyncio.fixture
async def setup() -> AsyncGenerator[Tuple[str, str], None]:
    import xoscar as xo
    from xinference.deploy.supervisor import start_supervisor_components
    from xinference.deploy.utils import create_worker_actor_pool
    from xinference.deploy.worker import start_worker_components

    pool = await create_worker_actor_pool(
        f"test://127.0.0.1:{xo.utils.get_next_port()}"
    )
    print(f"Pool running on localhost:{pool.external_address}")

    endpoint = await start_supervisor_components(
        pool.external_address, "127.0.0.1", xo.utils.get_next_port()
    )
    await start_worker_components(
        address=pool.external_address, supervisor_address=pool.external_address
    )

    # wait for the api.
    time.sleep(3)

    async with pool:
        yield endpoint, pool.external_address


def test_xinference_embedding_documents(setup: Tuple[str, str]) -> None:
    """Test xinference embeddings for documents."""
    from xinference.client import RESTfulClient

    endpoint, _ = setup

    client = RESTfulClient(endpoint)

    model_uid = client.launch_model(
        model_name="vicuna-v1.3",
        model_size_in_billions=7,
        model_format="ggmlv3",
        quantization="q4_0",
    )

    xinference = XinferenceEmbeddings(server_url=endpoint, model_uid=model_uid)

    documents = ["foo bar", "bar foo"]

    output = xinference.embed_documents(documents)

    assert len(output) == 2
    assert len(output[0]) == 4096


def test_xinference_embedding_query(setup: Tuple[str, str]) -> None:
    """Test xinference embeddings for query."""
    from xinference.client import RESTfulClient

    endpoint, _ = setup

    client = RESTfulClient(endpoint)

    model_uid = client.launch_model(
        model_name="vicuna-v1.3", model_size_in_billions=7, quantization="q4_0"
    )

    xinference = XinferenceEmbeddings(server_url=endpoint, model_uid=model_uid)

    document = "foo bar"

    output = xinference.embed_query(document)

    assert len(output) == 4096