from typing import Any, Dict, Iterator, List, Optional

from langchain_core.callbacks import CallbackManagerForLLMRun
from langchain_core.language_models import LLM
from langchain_core.outputs import GenerationChunk
from langchain_core.pydantic_v1 import Field, root_validator


class ExLlamaV2(LLM):
    """ExllamaV2 API.

    - Works only with GPTQ models for now.
    - Lora models are not supported yet.

    To use, you should have the exllamav2 library installed, and provide the
    path to the Llama model as a named parameter to the constructor.
    A sampler ``settings`` object is also required, since custom settings are
    not supported yet.

    Example:
        .. code-block:: python

            from langchain_community.llms import ExLlamaV2

            llm = ExLlamaV2(model_path="/path/to/llama/model", settings=settings)

    #TODO:
        - Add loras support
        - Add support for custom settings
        - Add support for custom stop sequences
    """

    client: Any
    model_path: str
    exllama_cache: Any = None
    config: Any = None
    generator: Any = None
    tokenizer: Any = None
    # Sampler settings used for generation. A settings object is required;
    # validation raises NotImplementedError if it is None, because custom
    # (default) settings are not supported yet.
    settings: Any = None

    # Langchain parameters
    logfunc = print

    stop_sequences: List[str] = Field([])
    """Sequences that will immediately stop the generator."""

    max_new_tokens: int = Field(150)
    """Maximum number of tokens to generate."""

    streaming: bool = Field(True)
    """Whether to stream the results, token by token."""

    verbose: bool = Field(True)
    """Whether to print debug information."""

    # Generator parameters
    disallowed_tokens: Optional[List[int]] = Field(None)
    """List of tokens to disallow during generation."""

    @root_validator()
    def validate_environment(cls, values: Dict[str, Any]) -> Dict[str, Any]:
        try:
            import torch
        except ImportError as e:
            raise ImportError(
                "Unable to import torch, please install with `pip install torch`."
            ) from e

        # Check that CUDA is available; exllamav2 only runs on CUDA devices.
        if not torch.cuda.is_available():
            raise EnvironmentError("CUDA is not available. ExllamaV2 requires CUDA.")

        try:
            from exllamav2 import (
                ExLlamaV2,
                ExLlamaV2Cache,
                ExLlamaV2Config,
                ExLlamaV2Tokenizer,
            )
            from exllamav2.generator import (
                ExLlamaV2BaseGenerator,
                ExLlamaV2StreamingGenerator,
            )
        except ImportError:
            raise ImportError(
                "Could not import exllamav2 library. "
                "Please install the exllamav2 library (CUDA 12.1 is required), "
                "for example: "
                "python -m pip install https://github.com/turboderp/exllamav2/releases/download/v0.0.12/exllamav2-0.0.12+cu121-cp311-cp311-linux_x86_64.whl"
            )

        # Replace the logging function with a no-op when not verbose.
        verbose = values["verbose"]
        if not verbose:
            values["logfunc"] = lambda *args, **kwargs: None
        logfunc = values["logfunc"]

        if values["settings"]:
            settings = values["settings"]
            logfunc(settings.__dict__)
        else:
            raise NotImplementedError(
                "settings is required. Custom settings are not supported yet."
            )

        config = ExLlamaV2Config()
        config.model_dir = values["model_path"]
        config.prepare()

        model = ExLlamaV2(config)
        # Allocate the cache lazily and let exllamav2 auto-split the model
        # across the available GPUs while loading.
        exllama_cache = ExLlamaV2Cache(model, lazy=True)
        model.load_autosplit(exllama_cache)

        tokenizer = ExLlamaV2Tokenizer(config)
        if values["streaming"]:
            generator = ExLlamaV2StreamingGenerator(model, exllama_cache, tokenizer)
        else:
            generator = ExLlamaV2BaseGenerator(model, exllama_cache, tokenizer)

        # Configure the model and generator
        values["stop_sequences"] = [
            x.strip().lower() for x in values["stop_sequences"]
        ]
        setattr(settings, "stop_sequences", values["stop_sequences"])
        logfunc(f"stop_sequences {values['stop_sequences']}")

        disallowed = values.get("disallowed_tokens")
        if disallowed:
            settings.disallow_tokens(tokenizer, disallowed)

        values["client"] = model
        values["generator"] = generator
        values["config"] = config
        values["tokenizer"] = tokenizer
        values["exllama_cache"] = exllama_cache

        return values

    @property
    def _llm_type(self) -> str:
        """Return type of llm."""
        return "ExLlamaV2"

    def get_num_tokens(self, text: str) -> int:
        """Get the number of tokens present in the text."""
        return self.generator.tokenizer.num_tokens(text)

    def _call(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        generator = self.generator

        if self.streaming:
            combined_text_output = ""
            for chunk in self._stream(
                prompt=prompt, stop=stop, run_manager=run_manager, **kwargs
            ):
                combined_text_output += chunk.text
            return combined_text_output
        else:
            output = generator.generate_simple(
                prompt=prompt,
                gen_settings=self.settings,
                num_tokens=self.max_new_tokens,
            )
            # generate_simple returns prompt + completion; strip the prompt.
            output = output[len(prompt) :]
            return output

    def _stream(
        self,
        prompt: str,
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> Iterator[GenerationChunk]:
        input_ids = self.tokenizer.encode(prompt)
        self.generator.warmup()
        self.generator.set_stop_conditions([])
        self.generator.begin_stream(input_ids, self.settings)

        generated_tokens = 0
        while True:
            chunk, eos, _ = self.generator.stream()
            generated_tokens += 1
            if run_manager:
                run_manager.on_llm_new_token(
                    token=chunk,
                    verbose=self.verbose,
                )
            yield GenerationChunk(text=chunk)

            if eos or generated_tokens == self.max_new_tokens:
                break

        return
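

# Usage sketch (not part of the class): a minimal, hedged example of wiring the
# wrapper together. It assumes a local GPTQ model directory at a placeholder
# path and that an ``ExLlamaV2Sampler.Settings`` instance from exllamav2 is a
# suitable ``settings`` object for the validator above; adjust both for your
# environment.
if __name__ == "__main__":
    from exllamav2.generator import ExLlamaV2Sampler

    # Sampler settings are mandatory; the validator raises NotImplementedError
    # when ``settings`` is None.
    sampler_settings = ExLlamaV2Sampler.Settings()
    sampler_settings.temperature = 0.85
    sampler_settings.top_p = 0.8

    llm = ExLlamaV2(
        model_path="/path/to/llama/model",  # placeholder model directory
        settings=sampler_settings,
        streaming=False,
        max_new_tokens=150,
        stop_sequences=["\n\n"],
    )
    print(llm.invoke("Name the planets in the solar system."))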