Harrison/llama (#2314)

Co-authored-by: RJ Adriaansen <adriaansen@eshcc.eur.nl>
doc
Harrison Chase 1 year ago committed by GitHub
parent 595ebe1796
commit d85f57ef9c

@ -0,0 +1,26 @@
# Llama.cpp
This page covers how to use [llama.cpp](https://github.com/ggerganov/llama.cpp) within LangChain.
It is broken into two parts: installation and setup, and then references to specific Llama-cpp wrappers.
## Installation and Setup
- Install the Python package with `pip install llama-cpp-python`
- Download one of the [supported models](https://github.com/ggerganov/llama.cpp#description) and convert it to the llama.cpp format per the [instructions](https://github.com/ggerganov/llama.cpp) (a scripted version of this step is sketched below)
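
For example, the integration tests added in this PR download a quantized Alpaca model and run the upstream conversion script; a minimal Python sketch of that flow is shown below (the URLs and filenames come from those tests and may change upstream):

```python
import os
from urllib.request import urlretrieve

# URLs taken from this PR's integration tests; they may move or change upstream.
model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
conversion_script = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"

local_filename = model_url.split("/")[-1]

# Fetch the conversion script, tokenizer, and quantized model if not already present.
if not os.path.exists("convert-unversioned-ggml-to-ggml.py"):
    urlretrieve(conversion_script, "convert-unversioned-ggml-to-ggml.py")
if not os.path.exists("tokenizer.model"):
    urlretrieve(tokenizer_url, "tokenizer.model")
if not os.path.exists(local_filename):
    urlretrieve(model_url, local_filename)

# Convert the downloaded model in place to the current ggml format.
os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")
```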
## Wrappers
### LLM
There exists a LlamaCpp LLM wrapper, which you can access with
```python
from langchain.llms import LlamaCpp
```
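A minimal usage sketch, assuming you have a converted ggml model on disk (the path and prompt below are placeholders):

```python
from langchain.llms import LlamaCpp

# model_path points at a model converted per the llama.cpp instructions above.
llm = LlamaCpp(model_path="./ggml-model-q4_0.bin")
print(llm("Q: What NFL team won the Super Bowl in the year Justin Bieber was born? A:"))
```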
For a more detailed walkthrough of this, see [this notebook](../modules/models/llms/integrations/llamacpp.ipynb)
### Embeddings
There exists a LlamaCpp Embeddings wrapper, which you can access with
```python
from langchain.embeddings import LlamaCppEmbeddings
```
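As with the LLM wrapper, a minimal sketch (the model path is a placeholder for your converted ggml file):

```python
from langchain.embeddings import LlamaCppEmbeddings

# model_path is a placeholder; point it at your converted ggml model.
llama = LlamaCppEmbeddings(model_path="./ggml-model-q4_0.bin")
query_result = llama.embed_query("This is a test document.")
doc_result = llama.embed_documents(["This is a test document."])
```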
For a more detailed walkthrough of this, see [this notebook](../modules/models/text_embedding/examples/llamacpp.ipynb)

@ -0,0 +1,98 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-cpp-python"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain.llms import LlamaCpp\n",
"from langchain import PromptTemplate, LLMChain"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"template = \"\"\"Question: {question}\n",
"\n",
"Answer: Let's think step by step.\"\"\"\n",
"\n",
"prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm = LlamaCpp(model_path=\"./ggml-model-q4_0.bin\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"llm_chain = LLMChain(prompt=prompt, llm=llm)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'\\n\\nWe know that Justin Bieber is currently 25 years old and that he was born on March 1st, 1994 and that he is a singer and he has an album called Purpose, so we know that he was born when Super Bowl XXXVIII was played between Dallas and Seattle and that it took place February 1st, 2004 and that the Seattle Seahawks won 24-21, so Seattle is our answer!'"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n",
"\n",
"llm_chain.run(question)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "workspace",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.13"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,66 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install llama-cpp-python"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings import LlamaCppEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llama = LlamaCppEmbeddings(model_path=\"/path/to/model/ggml-model-q4_0.bin\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"text = \"This is a test document.\""
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query_result = embeddings.embed_query(text)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"doc_result = embeddings.embed_documents([text])"
]
}
],
"metadata": {
"language_info": {
"name": "python"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -52,6 +52,9 @@ The following use cases require specific installs and api keys:
- If you want to set up OpenSearch on your local, [here](https://opensearch.org/docs/latest/)
- _DeepLake_:
- Install requirements with `pip install deeplake`
- _LlamaCpp_:
- Install requirements with `pip install llama-cpp-python`
- Download model and convert following [llama.cpp instructions](https://github.com/ggerganov/llama.cpp)
If you are using the `NLTKTextSplitter` or the `SpacyTextSplitter`, you will also need to install the appropriate models. For example, if you want to use the `SpacyTextSplitter`, you will need to install the `en_core_web_sm` model with `python -m spacy download en_core_web_sm`. Similarly, if you want to use the `NLTKTextSplitter`, you will need to install the `punkt` model with `python -m nltk.downloader punkt`.

@ -31,6 +31,7 @@ from langchain.llms import (
ForefrontAI,
GooseAI,
HuggingFaceHub,
LlamaCpp,
Modal,
OpenAI,
Petals,
@ -110,4 +111,5 @@ __all__ = [
"PALChain",
"set_handler",
"set_tracing_callback_manager",
"LlamaCpp",
]

@ -14,6 +14,7 @@ from langchain.embeddings.huggingface import (
)
from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
from langchain.embeddings.jina import JinaEmbeddings
from langchain.embeddings.llamacpp import LlamaCppEmbeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
from langchain.embeddings.self_hosted import SelfHostedEmbeddings
@ -30,6 +31,7 @@ __all__ = [
"HuggingFaceEmbeddings",
"CohereEmbeddings",
"JinaEmbeddings",
"LlamaCppEmbeddings",
"HuggingFaceHubEmbeddings",
"TensorflowHubEmbeddings",
"SagemakerEndpointEmbeddings",

@ -0,0 +1,118 @@
"""Wrapper around llama.cpp embedding models."""
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Extra, Field, root_validator
from langchain.embeddings.base import Embeddings
class LlamaCppEmbeddings(BaseModel, Embeddings):
"""Wrapper around llama.cpp embedding models.
To use, you should have the llama-cpp-python library installed, and provide the
path to the Llama model as a named parameter to the constructor.
Check out: https://github.com/abetlen/llama-cpp-python
Example:
.. code-block:: python
from langchain.embeddings import LlamaCppEmbeddings
llama = LlamaCppEmbeddings(model_path="/path/to/model.bin")
"""
client: Any #: :meta private:
model_path: str
n_ctx: int = Field(512, alias="n_ctx")
"""Token context window."""
n_parts: int = Field(-1, alias="n_parts")
"""Number of parts to split the model into.
If -1, the number of parts is automatically determined."""
seed: int = Field(-1, alias="seed")
"""Seed. If -1, a random seed is used."""
f16_kv: bool = Field(False, alias="f16_kv")
"""Use half-precision for key/value cache."""
logits_all: bool = Field(False, alias="logits_all")
"""Return logits for all tokens, not just the last token."""
vocab_only: bool = Field(False, alias="vocab_only")
"""Only load the vocabulary, no weights."""
use_mlock: bool = Field(False, alias="use_mlock")
"""Force system to keep model in RAM."""
n_threads: Optional[int] = Field(None, alias="n_threads")
"""Number of threads to use. If None, the number
of threads is automatically determined."""
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that llama-cpp-python library is installed."""
model_path = values["model_path"]
n_ctx = values["n_ctx"]
n_parts = values["n_parts"]
seed = values["seed"]
f16_kv = values["f16_kv"]
logits_all = values["logits_all"]
vocab_only = values["vocab_only"]
use_mlock = values["use_mlock"]
n_threads = values["n_threads"]
try:
from llama_cpp import Llama
values["client"] = Llama(
model_path=model_path,
n_ctx=n_ctx,
n_parts=n_parts,
seed=seed,
f16_kv=f16_kv,
logits_all=logits_all,
vocab_only=vocab_only,
use_mlock=use_mlock,
n_threads=n_threads,
embedding=True,
)
except ImportError:
raise ModuleNotFoundError(
"Could not import llama-cpp-python library. "
"Please install the llama-cpp-python library to "
"use this embedding model: pip install llama-cpp-python"
)
except Exception:
raise NameError(f"Could not load Llama model from path: {model_path}")
return values
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Embed a list of documents using the Llama model.
Args:
texts: The list of texts to embed.
Returns:
List of embeddings, one for each text.
"""
embeddings = [self.client.embed(text) for text in texts]
return [list(map(float, e)) for e in embeddings]
def embed_query(self, text: str) -> List[float]:
"""Embed a query using the Llama model.
Args:
text: The text to embed.
Returns:
Embeddings for the text.
"""
embedding = self.client.embed(text)
return list(map(float, embedding))

@ -14,6 +14,7 @@ from langchain.llms.gooseai import GooseAI
from langchain.llms.huggingface_endpoint import HuggingFaceEndpoint
from langchain.llms.huggingface_hub import HuggingFaceHub
from langchain.llms.huggingface_pipeline import HuggingFacePipeline
from langchain.llms.llamacpp import LlamaCpp
from langchain.llms.modal import Modal
from langchain.llms.nlpcloud import NLPCloud
from langchain.llms.openai import AzureOpenAI, OpenAI, OpenAIChat
@ -35,6 +36,7 @@ __all__ = [
"DeepInfra",
"ForefrontAI",
"GooseAI",
"LlamaCpp",
"Modal",
"NLPCloud",
"OpenAI",
@ -67,6 +69,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
"gooseai": GooseAI,
"huggingface_hub": HuggingFaceHub,
"huggingface_endpoint": HuggingFaceEndpoint,
"llamacpp": LlamaCpp,
"modal": Modal,
"sagemaker_endpoint": SagemakerEndpoint,
"nlpcloud": NLPCloud,

@ -0,0 +1,184 @@
"""Wrapper around llama.cpp."""
import logging
from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field, root_validator
from langchain.llms.base import LLM
logger = logging.getLogger(__name__)
class LlamaCpp(LLM, BaseModel):
"""Wrapper around the llama.cpp model.
To use, you should have the llama-cpp-python library installed, and provide the
path to the Llama model as a named parameter to the constructor.
Check out: https://github.com/abetlen/llama-cpp-python
Example:
.. code-block:: python
from langchain.llms import LlamaCpp
llm = LlamaCpp(model_path="/path/to/llama/model")
"""
client: Any #: :meta private:
model_path: str
"""The path to the Llama model file."""
n_ctx: int = Field(512, alias="n_ctx")
"""Token context window."""
n_parts: int = Field(-1, alias="n_parts")
"""Number of parts to split the model into.
If -1, the number of parts is automatically determined."""
seed: int = Field(-1, alias="seed")
"""Seed. If -1, a random seed is used."""
f16_kv: bool = Field(False, alias="f16_kv")
"""Use half-precision for key/value cache."""
logits_all: bool = Field(False, alias="logits_all")
"""Return logits for all tokens, not just the last token."""
vocab_only: bool = Field(False, alias="vocab_only")
"""Only load the vocabulary, no weights."""
use_mlock: bool = Field(False, alias="use_mlock")
"""Force system to keep model in RAM."""
n_threads: Optional[int] = Field(None, alias="n_threads")
"""Number of threads to use.
If None, the number of threads is automatically determined."""
suffix: Optional[str] = Field(None)
"""A suffix to append to the generated text. If None, no suffix is appended."""
max_tokens: Optional[int] = 256
"""The maximum number of tokens to generate."""
temperature: Optional[float] = 0.8
"""The temperature to use for sampling."""
top_p: Optional[float] = 0.95
"""The top-p value to use for sampling."""
logprobs: Optional[int] = Field(None)
"""The number of logprobs to return. If None, no logprobs are returned."""
echo: Optional[bool] = False
"""Whether to echo the prompt."""
stop: Optional[List[str]] = []
"""A list of strings to stop generation when encountered."""
repeat_penalty: Optional[float] = 1.1
"""The penalty to apply to repeated tokens."""
top_k: Optional[int] = 40
"""The top-k value to use for sampling."""
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that llama-cpp-python library is installed."""
model_path = values["model_path"]
n_ctx = values["n_ctx"]
n_parts = values["n_parts"]
seed = values["seed"]
f16_kv = values["f16_kv"]
logits_all = values["logits_all"]
vocab_only = values["vocab_only"]
use_mlock = values["use_mlock"]
n_threads = values["n_threads"]
try:
from llama_cpp import Llama
values["client"] = Llama(
model_path=model_path,
n_ctx=n_ctx,
n_parts=n_parts,
seed=seed,
f16_kv=f16_kv,
logits_all=logits_all,
vocab_only=vocab_only,
use_mlock=use_mlock,
n_threads=n_threads,
)
except ImportError:
raise ModuleNotFoundError(
"Could not import llama-cpp-python library. "
"Please install the llama-cpp-python library to "
"use this embedding model: pip install llama-cpp-python"
)
except Exception:
raise NameError(f"Could not load Llama model from path: {model_path}")
return values
@property
def _default_params(self) -> Dict[str, Any]:
"""Get the default parameters for calling llama_cpp."""
return {
"suffix": self.suffix,
"max_tokens": self.max_tokens,
"temperature": self.temperature,
"top_p": self.top_p,
"logprobs": self.logprobs,
"echo": self.echo,
"stop_sequences": self.stop,
"repeat_penalty": self.repeat_penalty,
"top_k": self.top_k,
}
@property
def _identifying_params(self) -> Dict[str, Any]:
"""Get the identifying parameters."""
return {**{"model_path": self.model_path}, **self._default_params}
@property
def _llm_type(self) -> str:
"""Return type of llm."""
return "llama.cpp"
def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
"""Call the Llama model and return the output.
Args:
prompt: The prompt to use for generation.
stop: A list of strings to stop generation when encountered.
Returns:
The generated text.
Example:
.. code-block:: python
from langchain.llms import LlamaCpp
llm = LlamaCpp(model_path="/path/to/local/llama/model.bin")
llm("This is a prompt.")
"""
params = self._default_params
if self.stop and stop is not None:
raise ValueError("`stop` found in both the input and default params.")
elif self.stop:
params["stop_sequences"] = self.stop
elif stop is not None:
params["stop_sequences"] = stop
else:
params["stop_sequences"] = []
text = self.client(
prompt=prompt,
max_tokens=params["max_tokens"],
temperature=params["temperature"],
top_p=params["top_p"],
logprobs=params["logprobs"],
echo=params["echo"],
stop=params["stop_sequences"],
repeat_penalty=params["repeat_penalty"],
top_k=params["top_k"],
)
return text["choices"][0]["text"]

@ -0,0 +1,46 @@
# flake8: noqa
"""Test llamacpp embeddings."""
import os
from urllib.request import urlretrieve
from langchain.embeddings.llamacpp import LlamaCppEmbeddings
def get_model() -> str:
"""Download model.
From https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/,
convert to new ggml format and return model path.
"""
model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
conversion_script = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"
local_filename = model_url.split("/")[-1]
if not os.path.exists("convert-unversioned-ggml-to-ggml.py"):
urlretrieve(conversion_script, "convert-unversioned-ggml-to-ggml.py")
if not os.path.exists("tokenizer.model"):
urlretrieve(tokenizer_url, "tokenizer.model")
if not os.path.exists(local_filename):
urlretrieve(model_url, local_filename)
os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")
return local_filename
def test_llamacpp_embedding_documents() -> None:
"""Test llamacpp embeddings."""
documents = ["foo bar"]
model_path = get_model()
embedding = LlamaCppEmbeddings(model_path=model_path)
output = embedding.embed_documents(documents)
assert len(output) == 1
assert len(output[0]) == 512
def test_llamacpp_embedding_query() -> None:
"""Test llamacpp embeddings."""
document = "foo bar"
model_path = get_model()
embedding = LlamaCppEmbeddings(model_path=model_path)
output = embedding.embed_query(document)
assert len(output) == 512

@ -0,0 +1,34 @@
# flake8: noqa
"""Test Llama.cpp wrapper."""
import os
from urllib.request import urlretrieve
from langchain.llms import LlamaCpp
def get_model() -> str:
"""Download model. f
From https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/,
convert to new ggml format and return model path."""
model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
conversion_script = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"
local_filename = model_url.split("/")[-1]
if not os.path.exists("convert-unversioned-ggml-to-ggml.py"):
urlretrieve(conversion_script, "convert-unversioned-ggml-to-ggml.py")
if not os.path.exists("tokenizer.model"):
urlretrieve(tokenizer_url, "tokenizer.model")
if not os.path.exists(local_filename):
urlretrieve(model_url, local_filename)
os.system(f"python convert-unversioned-ggml-to-ggml.py . tokenizer.model")
return local_filename
def test_llamacpp_inference() -> None:
"""Test valid llama.cpp inference."""
model_path = get_model()
llm = LlamaCpp(model_path=model_path)
output = llm("Say foo:")
assert isinstance(output, str)