forked from Archives/langchain
parent
595ebe1796
commit
d85f57ef9c
@ -0,0 +1,26 @@
|
||||
# Llama.cpp
|
||||
|
||||
This page covers how to use [llama.cpp](https://github.com/ggerganov/llama.cpp) within LangChain.
|
||||
It is broken into two parts: installation and setup, and then references to specific Llama.cpp wrappers.
|
||||
|
||||
## Installation and Setup
|
||||
- Install the Python package with `pip install llama-cpp-python`
|
||||
- Download one of the [supported models](https://github.com/ggerganov/llama.cpp#description) and convert them to the llama.cpp format per the [instructions](https://github.com/ggerganov/llama.cpp)
|
||||
|
||||
## Wrappers
|
||||
|
||||
### LLM
|
||||
|
||||
There exists a LlamaCpp LLM wrapper, which you can access with
|
||||
```python
|
||||
from langchain.llms import LlamaCpp
|
||||
```
|
||||
For a more detailed walkthrough of this, see [this notebook](../modules/models/llms/integrations/llamacpp.ipynb)
|
||||
|
||||
### Embeddings
|
||||
|
||||
There exists a LlamaCpp Embeddings wrapper, which you can access with
|
||||
```python
|
||||
from langchain.embeddings import LlamaCppEmbeddings
|
||||
```
|
||||
For a more detailed walkthrough of this, see [this notebook](../modules/models/text_embedding/examples/llamacpp.ipynb)
|
@ -0,0 +1,98 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install llama-cpp-python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.llms import LlamaCpp\n",
|
||||
"from langchain import PromptTemplate, LLMChain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"template = \"\"\"Question: {question}\n",
|
||||
"\n",
|
||||
"Answer: Let's think step by step.\"\"\"\n",
|
||||
"\n",
|
||||
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm = LlamaCpp(model_path=\"./ggml-model-q4_0.bin\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"llm_chain = LLMChain(prompt=prompt, llm=llm)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'\\n\\nWe know that Justin Bieber is currently 25 years old and that he was born on March 1st, 1994 and that he is a singer and he has an album called Purpose, so we know that he was born when Super Bowl XXXVIII was played between Dallas and Seattle and that it took place February 1st, 2004 and that the Seattle Seahawks won 24-21, so Seattle is our answer!'"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n",
|
||||
"\n",
|
||||
"llm_chain.run(question)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "workspace",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.13"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,66 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pip install llama-cpp-python"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.embeddings import LlamaCppEmbeddings"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"embeddings = LlamaCppEmbeddings(model_path=\"/path/to/model/ggml-model-q4_0.bin\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"text = \"This is a test document.\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query_result = embeddings.embed_query(text)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"doc_result = embeddings.embed_documents([text])"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"language_info": {
|
||||
"name": "python"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,118 @@
|
||||
"""Wrapper around llama.cpp embedding models."""
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, Extra, Field, root_validator
|
||||
|
||||
from langchain.embeddings.base import Embeddings
|
||||
|
||||
|
||||
class LlamaCppEmbeddings(BaseModel, Embeddings):
    """Embeddings produced by a local llama.cpp model.

    The llama-cpp-python library must be installed, and the location of the
    Llama model file is supplied through the ``model_path`` constructor
    argument. See https://github.com/abetlen/llama-cpp-python

    Example:
        .. code-block:: python

            from langchain.embeddings import LlamaCppEmbeddings
            llama = LlamaCppEmbeddings(model_path="/path/to/model.bin")
    """

    client: Any  #: :meta private:
    model_path: str

    n_ctx: int = Field(512, alias="n_ctx")
    """Token context window."""

    n_parts: int = Field(-1, alias="n_parts")
    """Number of parts to split the model into.
    If -1, the number of parts is automatically determined."""

    seed: int = Field(-1, alias="seed")
    """Seed. If -1, a random seed is used."""

    f16_kv: bool = Field(False, alias="f16_kv")
    """Use half-precision for key/value cache."""

    logits_all: bool = Field(False, alias="logits_all")
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = Field(False, alias="vocab_only")
    """Only load the vocabulary, no weights."""

    use_mlock: bool = Field(False, alias="use_mlock")
    """Force system to keep model in RAM."""

    n_threads: Optional[int] = Field(None, alias="n_threads")
    """Number of threads to use. If None, the number
    of threads is automatically determined."""

    class Config:
        """Configuration for this pydantic object."""

        extra = Extra.forbid

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Create the llama.cpp client from the field values.

        Args:
            values: The field values collected for this model.

        Returns:
            ``values`` with the constructed client stored under ``"client"``.

        Raises:
            ModuleNotFoundError: If importing ``llama_cpp`` fails.
            NameError: If any other error occurs while building the client.
        """
        model_path = values["model_path"]
        # Read every option up front, in field order, so a missing key fails
        # before the import is attempted.
        option_names = (
            "n_ctx",
            "n_parts",
            "seed",
            "f16_kv",
            "logits_all",
            "vocab_only",
            "use_mlock",
            "n_threads",
        )
        llama_options = {name: values[name] for name in option_names}

        try:
            from llama_cpp import Llama

            values["client"] = Llama(
                model_path=model_path,
                embedding=True,
                **llama_options,
            )
        except ImportError:
            raise ModuleNotFoundError(
                "Could not import llama-cpp-python library. "
                "Please install the llama-cpp-python library to "
                "use this embedding model: pip install llama-cpp-python"
            )
        except Exception:
            raise NameError(f"Could not load Llama model from path: {model_path}")

        return values

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed each text in ``texts`` with the Llama model.

        Args:
            texts: The list of texts to embed.

        Returns:
            One list of floats per input text, in input order.
        """
        raw_vectors = []
        for text in texts:
            raw_vectors.append(self.client.embed(text))
        return [[float(value) for value in vector] for vector in raw_vectors]

    def embed_query(self, text: str) -> List[float]:
        """Embed a single query string with the Llama model.

        Args:
            text: The text to embed.

        Returns:
            The embedding of ``text`` as a list of floats.
        """
        return [float(value) for value in self.client.embed(text)]
|
@ -0,0 +1,184 @@
|
||||
"""Wrapper around llama.cpp."""
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from pydantic import BaseModel, Field, root_validator
|
||||
|
||||
from langchain.llms.base import LLM
|
||||
|
||||
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LlamaCpp(LLM, BaseModel):
    """Wrapper around the llama.cpp model.

    To use, you should have the llama-cpp-python library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/abetlen/llama-cpp-python

    Example:
        .. code-block:: python

            from langchain.llms import LlamaCpp
            llm = LlamaCpp(model_path="/path/to/llama/model")
    """

    client: Any  #: :meta private:
    model_path: str
    """The path to the Llama model file."""

    n_ctx: int = Field(512, alias="n_ctx")
    """Token context window."""

    n_parts: int = Field(-1, alias="n_parts")
    """Number of parts to split the model into.
    If -1, the number of parts is automatically determined."""

    seed: int = Field(-1, alias="seed")
    """Seed. If -1, a random seed is used."""

    f16_kv: bool = Field(False, alias="f16_kv")
    """Use half-precision for key/value cache."""

    logits_all: bool = Field(False, alias="logits_all")
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = Field(False, alias="vocab_only")
    """Only load the vocabulary, no weights."""

    use_mlock: bool = Field(False, alias="use_mlock")
    """Force system to keep model in RAM."""

    n_threads: Optional[int] = Field(None, alias="n_threads")
    """Number of threads to use.
    If None, the number of threads is automatically determined."""

    suffix: Optional[str] = Field(None)
    """A suffix to append to the generated text. If None, no suffix is appended."""

    max_tokens: Optional[int] = 256
    """The maximum number of tokens to generate."""

    temperature: Optional[float] = 0.8
    """The temperature to use for sampling."""

    top_p: Optional[float] = 0.95
    """The top-p value to use for sampling."""

    logprobs: Optional[int] = Field(None)
    """The number of logprobs to return. If None, no logprobs are returned."""

    echo: Optional[bool] = False
    """Whether to echo the prompt."""

    stop: Optional[List[str]] = []
    """A list of strings to stop generation when encountered."""

    repeat_penalty: Optional[float] = 1.1
    """The penalty to apply to repeated tokens."""

    top_k: Optional[int] = 40
    """The top-k value to use for sampling."""

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Validate that llama-cpp-python is installed and load the model.

        Args:
            values: The field values collected for this model.

        Returns:
            ``values`` with the constructed client stored under ``"client"``.

        Raises:
            ModuleNotFoundError: If importing ``llama_cpp`` fails.
            NameError: If any other error occurs while building the client.
        """
        model_path = values["model_path"]
        n_ctx = values["n_ctx"]
        n_parts = values["n_parts"]
        seed = values["seed"]
        f16_kv = values["f16_kv"]
        logits_all = values["logits_all"]
        vocab_only = values["vocab_only"]
        use_mlock = values["use_mlock"]
        n_threads = values["n_threads"]

        try:
            from llama_cpp import Llama

            values["client"] = Llama(
                model_path=model_path,
                n_ctx=n_ctx,
                n_parts=n_parts,
                seed=seed,
                f16_kv=f16_kv,
                logits_all=logits_all,
                vocab_only=vocab_only,
                use_mlock=use_mlock,
                n_threads=n_threads,
            )
        except ImportError:
            # This is the LLM wrapper, so the message must not talk about
            # an "embedding model".
            raise ModuleNotFoundError(
                "Could not import llama-cpp-python library. "
                "Please install the llama-cpp-python library to "
                "use this model: pip install llama-cpp-python"
            )
        except Exception:
            raise NameError(f"Could not load Llama model from path: {model_path}")

        return values

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling llama_cpp."""
        return {
            "suffix": self.suffix,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "logprobs": self.logprobs,
            "echo": self.echo,
            "stop_sequences": self.stop,
            "repeat_penalty": self.repeat_penalty,
            "top_k": self.top_k,
        }

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Get the identifying parameters."""
        return {**{"model_path": self.model_path}, **self._default_params}

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "llama.cpp"

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        """Call the Llama model and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: A list of strings to stop generation when encountered.
                Used only when the instance has no ``stop`` list of its own.

        Returns:
            The generated text.

        Raises:
            ValueError: If the instance has a non-empty ``stop`` list and a
                ``stop`` argument is also passed.

        Example:
            .. code-block:: python

                from langchain.llms import LlamaCpp
                llm = LlamaCpp(model_path="/path/to/local/llama/model.bin")
                llm("This is a prompt.")
        """
        params = self._default_params
        if self.stop and stop is not None:
            raise ValueError("`stop` found in both the input and default params.")
        # Use the instance-level stop list when set; otherwise honor the
        # per-call ``stop`` argument instead of silently discarding it.
        params["stop_sequences"] = self.stop or stop or []

        # Run the completion and return the text of the first choice.
        text = self.client(
            prompt=prompt,
            max_tokens=params["max_tokens"],
            temperature=params["temperature"],
            top_p=params["top_p"],
            logprobs=params["logprobs"],
            echo=params["echo"],
            stop=params["stop_sequences"],
            repeat_penalty=params["repeat_penalty"],
            top_k=params["top_k"],
        )
        return text["choices"][0]["text"]
|
@ -0,0 +1,46 @@
|
||||
# flake8: noqa
|
||||
"""Test llamacpp embeddings."""
|
||||
import os
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
from langchain.embeddings.llamacpp import LlamaCppEmbeddings
|
||||
|
||||
|
||||
def get_model() -> str:
    """Make the Alpaca test model available locally and return its file name.

    The weights come from https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/.
    The conversion script, the tokenizer and the weights are each downloaded
    into the current directory only if not already there. Right after the
    weights are downloaded, the conversion script is run to convert them to
    the new ggml format.
    """
    weights_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
    weights_name = weights_url.split("/")[-1]

    # (source URL, local file name) pairs needed before the conversion can run.
    prerequisites = (
        (
            "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py",
            "convert-unversioned-ggml-to-ggml.py",
        ),
        (
            "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model",
            "tokenizer.model",
        ),
    )
    for url, filename in prerequisites:
        if not os.path.exists(filename):
            urlretrieve(url, filename)

    if os.path.exists(weights_name):
        return weights_name

    urlretrieve(weights_url, weights_name)
    os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")
    return weights_name
|
||||
|
||||
|
||||
def test_llamacpp_embedding_documents() -> None:
    """Embed a one-element document list and check the result's dimensions."""
    texts = ["foo bar"]
    embedder = LlamaCppEmbeddings(model_path=get_model())
    vectors = embedder.embed_documents(texts)
    # One vector per document, each with 512 entries.
    assert len(vectors) == 1
    assert len(vectors[0]) == 512
|
||||
|
||||
|
||||
def test_llamacpp_embedding_query() -> None:
    """Embed a single query string and check the length of the vector."""
    query = "foo bar"
    embedder = LlamaCppEmbeddings(model_path=get_model())
    vector = embedder.embed_query(query)
    assert len(vector) == 512
|
@ -0,0 +1,34 @@
|
||||
# flake8: noqa
|
||||
"""Test Llama.cpp wrapper."""
|
||||
import os
|
||||
from urllib.request import urlretrieve
|
||||
|
||||
from langchain.llms import LlamaCpp
|
||||
|
||||
|
||||
def get_model() -> str:
    """Download the test model if needed and return its local path.

    The model is taken from https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/.
    When the weights have to be downloaded, the llama.cpp conversion script is
    run afterwards to convert them to the new ggml format.
    """

    def fetch_if_missing(url: str, filename: str) -> bool:
        """Download ``url`` to ``filename`` unless it exists; return True if downloaded."""
        if os.path.exists(filename):
            return False
        urlretrieve(url, filename)
        return True

    script_url = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"
    tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
    model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
    model_file = model_url.rsplit("/", 1)[-1]

    fetch_if_missing(script_url, "convert-unversioned-ggml-to-ggml.py")
    fetch_if_missing(tokenizer_url, "tokenizer.model")
    # Conversion is only run for freshly downloaded weights.
    if fetch_if_missing(model_url, model_file):
        os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")

    return model_file
|
||||
|
||||
|
||||
def test_llamacpp_inference() -> None:
    """Run one prompt through the llama.cpp wrapper and expect a string back."""
    llm = LlamaCpp(model_path=get_model())
    completion = llm("Say foo:")
    assert isinstance(completion, str)
|
Loading…
Reference in New Issue