mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
359 lines
12 KiB
Python
359 lines
12 KiB
Python
|
from __future__ import annotations
|
||
|
|
||
|
import logging
|
||
|
from pathlib import Path
|
||
|
from typing import TYPE_CHECKING, Any, Dict, Iterator, List, Optional, Union
|
||
|
|
||
|
from langchain_core.callbacks import CallbackManagerForLLMRun
|
||
|
from langchain_core.language_models.llms import LLM
|
||
|
from langchain_core.outputs import GenerationChunk
|
||
|
from langchain_core.pydantic_v1 import Field, root_validator
|
||
|
from langchain_core.utils import get_pydantic_field_names
|
||
|
from langchain_core.utils.utils import build_extra_kwargs
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
from llama_cpp import LlamaGrammar
|
||
|
|
||
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
|
||
|
|
||
|
|
||
|
class LlamaCpp(LLM):
    """llama.cpp model.

    To use, you should have the llama-cpp-python library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    Check out: https://github.com/abetlen/llama-cpp-python

    Example:
        .. code-block:: python

            from langchain_community.llms import LlamaCpp
            llm = LlamaCpp(model_path="/path/to/llama/model")
    """

    client: Any  #: :meta private:
    model_path: str
    """The path to the Llama model file."""

    lora_base: Optional[str] = None
    """The path to the Llama LoRA base model."""

    lora_path: Optional[str] = None
    """The path to the Llama LoRA. If None, no LoRa is loaded."""

    n_ctx: int = Field(512, alias="n_ctx")
    """Token context window."""

    n_parts: int = Field(-1, alias="n_parts")
    """Number of parts to split the model into.
    If -1, the number of parts is automatically determined."""

    seed: int = Field(-1, alias="seed")
    """Seed. If -1, a random seed is used."""

    f16_kv: bool = Field(True, alias="f16_kv")
    """Use half-precision for key/value cache."""

    logits_all: bool = Field(False, alias="logits_all")
    """Return logits for all tokens, not just the last token."""

    vocab_only: bool = Field(False, alias="vocab_only")
    """Only load the vocabulary, no weights."""

    use_mlock: bool = Field(False, alias="use_mlock")
    """Force system to keep model in RAM."""

    n_threads: Optional[int] = Field(None, alias="n_threads")
    """Number of threads to use.
    If None, the number of threads is automatically determined."""

    n_batch: Optional[int] = Field(8, alias="n_batch")
    """Number of tokens to process in parallel.
    Should be a number between 1 and n_ctx."""

    n_gpu_layers: Optional[int] = Field(None, alias="n_gpu_layers")
    """Number of layers to be loaded into gpu memory. Default None."""

    suffix: Optional[str] = Field(None)
    """A suffix to append to the generated text. If None, no suffix is appended."""

    max_tokens: Optional[int] = 256
    """The maximum number of tokens to generate."""

    temperature: Optional[float] = 0.8
    """The temperature to use for sampling."""

    top_p: Optional[float] = 0.95
    """The top-p value to use for sampling."""

    logprobs: Optional[int] = Field(None)
    """The number of logprobs to return. If None, no logprobs are returned."""

    echo: Optional[bool] = False
    """Whether to echo the prompt."""

    stop: Optional[List[str]] = []
    """A list of strings to stop generation when encountered."""

    repeat_penalty: Optional[float] = 1.1
    """The penalty to apply to repeated tokens."""

    top_k: Optional[int] = 40
    """The top-k value to use for sampling."""

    last_n_tokens_size: Optional[int] = 64
    """The number of tokens to look back when applying the repeat_penalty."""

    use_mmap: Optional[bool] = True
    """Whether to memory-map the model file when loading it.
    The value is forwarded unchanged to llama_cpp.Llama."""

    rope_freq_scale: float = 1.0
    """Scale factor for rope sampling."""

    rope_freq_base: float = 10000.0
    """Base frequency for rope sampling."""

    model_kwargs: Dict[str, Any] = Field(default_factory=dict)
    """Any additional parameters to pass to llama_cpp.Llama."""

    streaming: bool = True
    """Whether to stream the results, token by token."""

    grammar_path: Optional[Union[str, Path]] = None
    """
    grammar_path: Path to the .gbnf file that defines formal grammars
    for constraining model outputs. For instance, the grammar can be used
    to force the model to generate valid JSON or to speak exclusively in emojis. At most
    one of grammar_path and grammar should be passed in.
    """
    grammar: Optional[Union[str, LlamaGrammar]] = None
    """
    grammar: formal grammar for constraining model outputs. For instance, the grammar
    can be used to force the model to generate valid JSON or to speak exclusively in
    emojis. At most one of grammar_path and grammar should be passed in.
    """

    verbose: bool = True
    """Print verbose output to stderr."""

    @root_validator()
    def validate_environment(cls, values: Dict) -> Dict:
        """Check the environment, load the model and resolve the grammar.

        Imports llama-cpp-python, builds the ``llama_cpp.Llama`` client from the
        model-level fields plus ``model_kwargs`` and stores it in
        ``values["client"]``. A ``grammar`` given as a string, or a
        ``grammar_path``, is converted into a ``LlamaGrammar`` and stored in
        ``values["grammar"]``.

        Args:
            values: The field values collected for this model.

        Returns:
            The same ``values`` dict with ``client`` (and possibly ``grammar``)
            filled in.

        Raises:
            ImportError: If llama-cpp-python is not installed.
            ValueError: If both ``grammar`` and ``grammar_path`` are given, or
                if the model cannot be loaded from ``model_path``.
        """
        try:
            from llama_cpp import Llama, LlamaGrammar
        except ImportError as e:
            raise ImportError(
                "Could not import llama-cpp-python library. "
                "Please install the llama-cpp-python library to "
                "use this language model: pip install llama-cpp-python"
            ) from e

        model_path = values["model_path"]

        # Reject conflicting grammar settings before the (potentially slow and
        # memory-hungry) model load, so a configuration error fails fast and
        # does not leave a loaded client behind.
        if values["grammar"] and values["grammar_path"]:
            grammar = values["grammar"]
            grammar_path = values["grammar_path"]
            raise ValueError(
                "Can only pass in one of grammar and grammar_path. Received "
                f"{grammar=} and {grammar_path=}."
            )

        # Fields that are forwarded as keyword arguments to llama_cpp.Llama.
        model_param_names = [
            "rope_freq_scale",
            "rope_freq_base",
            "lora_path",
            "lora_base",
            "n_ctx",
            "n_parts",
            "seed",
            "f16_kv",
            "logits_all",
            "vocab_only",
            "use_mlock",
            "n_threads",
            "n_batch",
            "use_mmap",
            "last_n_tokens_size",
            "verbose",
        ]
        model_params = {k: values[k] for k in model_param_names}
        # For backwards compatibility, only include if non-null.
        if values["n_gpu_layers"] is not None:
            model_params["n_gpu_layers"] = values["n_gpu_layers"]

        # Explicit model_kwargs take precedence over the named fields above.
        model_params.update(values["model_kwargs"])

        try:
            values["client"] = Llama(model_path, **model_params)
        except Exception as e:
            raise ValueError(
                f"Could not load Llama model from path: {model_path}. "
                f"Received error {e}"
            ) from e

        # At most one of grammar / grammar_path is set at this point.
        if isinstance(values["grammar"], str):
            values["grammar"] = LlamaGrammar.from_string(values["grammar"])
        elif values["grammar_path"]:
            values["grammar"] = LlamaGrammar.from_file(values["grammar_path"])
        return values

    @root_validator(pre=True)
    def build_model_kwargs(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        """Build extra kwargs from additional params that were passed in."""
        all_required_field_names = get_pydantic_field_names(cls)
        extra = values.get("model_kwargs", {})
        values["model_kwargs"] = build_extra_kwargs(
            extra, values, all_required_field_names
        )
        return values

    @property
    def _default_params(self) -> Dict[str, Any]:
        """Get the default parameters for calling llama_cpp.

        A new dict is built on every access; ``grammar`` is only included
        when one is configured.
        """
        params = {
            "suffix": self.suffix,
            "max_tokens": self.max_tokens,
            "temperature": self.temperature,
            "top_p": self.top_p,
            "logprobs": self.logprobs,
            "echo": self.echo,
            "stop_sequences": self.stop,  # key here is convention among LLM classes
            "repeat_penalty": self.repeat_penalty,
            "top_k": self.top_k,
        }
        if self.grammar:
            params["grammar"] = self.grammar
        return params

    @property
    def _identifying_params(self) -> Dict[str, Any]:
        """Get the identifying parameters: the model path plus the defaults."""
        return {"model_path": self.model_path, **self._default_params}

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "llamacpp"

    def _get_parameters(self, stop: Optional[List[str]] = None) -> Dict[str, Any]:
        """
        Performs sanity check, preparing parameters in format needed by llama_cpp.

        Args:
            stop (Optional[List[str]]): List of stop sequences for llama_cpp.

        Returns:
            Dictionary containing the combined parameters.

        Raises:
            ValueError: If stop sequences are configured on the instance and
                also passed in through ``stop``.
        """
        # Raise error if stop sequences are in both input and default params
        if self.stop and stop is not None:
            raise ValueError("`stop` found in both the input and default params.")

        params = self._default_params

        # llama_cpp expects the "stop" key not this, so we remove it:
        params.pop("stop_sequences")

        # then sets it as configured, or default to an empty list:
        params["stop"] = self.stop or stop or []

        return params

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        """Call the Llama model and return the output.

        Args:
            prompt: The prompt to use for generation.
            stop: A list of strings to stop generation when encountered.
            run_manager: Optional callback manager; only used on the
                streaming path, where it is handed to ``_stream``.
            **kwargs: Extra call parameters; they override the defaults.

        Returns:
            The generated text.

        Example:
            .. code-block:: python

                from langchain_community.llms import LlamaCpp
                llm = LlamaCpp(model_path="/path/to/local/llama/model.bin")
                llm("This is a prompt.")
        """
        if self.streaming:
            # If streaming is enabled, consume the stream and return the
            # concatenated text of all generated chunks.
            return "".join(
                chunk.text
                for chunk in self._stream(
                    prompt=prompt,
                    stop=stop,
                    run_manager=run_manager,
                    **kwargs,
                )
            )

        params = {**self._get_parameters(stop), **kwargs}
        result = self.client(prompt=prompt, **params)
        return result["choices"][0]["text"]

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        """Yields results objects as they are generated in real time.

        It also calls the callback manager's on_llm_new_token event with
        similar parameters to the OpenAI LLM class method of the same name.

        Args:
            prompt: The prompts to pass into the model.
            stop: Optional list of stop words to use when generating.
            run_manager: Optional callback manager notified of every new token.
            **kwargs: Extra call parameters; they override the defaults.

        Yields:
            ``GenerationChunk`` objects whose ``text`` is the newly generated
            text and whose ``generation_info`` carries the ``logprobs`` entry
            of the first choice (``None`` when absent).

        Example:
            .. code-block:: python

                from langchain_community.llms import LlamaCpp
                llm = LlamaCpp(
                    model_path="/path/to/local/model.bin",
                    temperature = 0.5
                )
                for chunk in llm.stream("Ask 'Hi, how are you?' like a pirate:'",
                        stop=["'","\\n"]):
                    print(chunk, end='', flush=True)

        """
        params = {**self._get_parameters(stop), **kwargs}
        result = self.client(prompt=prompt, stream=True, **params)
        for part in result:
            first_choice = part["choices"][0]
            logprobs = first_choice.get("logprobs", None)
            chunk = GenerationChunk(
                text=first_choice["text"],
                generation_info={"logprobs": logprobs},
            )
            # Notify callbacks before yielding: a consumer that stops iterating
            # right after receiving a chunk would otherwise never trigger the
            # callback for that chunk.
            if run_manager:
                run_manager.on_llm_new_token(
                    token=chunk.text, verbose=self.verbose, log_probs=logprobs
                )
            yield chunk

    def get_num_tokens(self, text: str) -> int:
        """Return the number of tokens the client's tokenizer produces for ``text``.

        Args:
            text: The text to tokenize; it is UTF-8 encoded before tokenizing.

        Returns:
            The length of the token sequence returned by ``client.tokenize``.
        """
        tokenized_text = self.client.tokenize(text.encode("utf-8"))
        return len(tokenized_text)
|