From d85f57ef9cbbbd5e512e064fb81c531b28c6591c Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Sun, 2 Apr 2023 14:57:45 -0700
Subject: [PATCH] Harrison/llama (#2314)

Co-authored-by: RJ Adriaansen
---
 docs/ecosystem/llamacpp.md                    |  26 +++
 .../models/llms/integrations/llamacpp.ipynb   |  98 ++++++++++
 .../text_embedding/examples/llamacpp.ipynb    |  66 +++++++
 docs/reference/integrations.md                |   3 +
 langchain/__init__.py                         |   2 +
 langchain/embeddings/__init__.py              |   2 +
 langchain/embeddings/llamacpp.py              | 118 +++++++++++
 langchain/llms/__init__.py                    |   3 +
 langchain/llms/llamacpp.py                    | 184 ++++++++++++++++++
 .../embeddings/test_llamacpp.py               |  46 +++++
 tests/integration_tests/llms/test_llamacpp.py |  34 ++++
 11 files changed, 582 insertions(+)
 create mode 100644 docs/ecosystem/llamacpp.md
 create mode 100644 docs/modules/models/llms/integrations/llamacpp.ipynb
 create mode 100644 docs/modules/models/text_embedding/examples/llamacpp.ipynb
 create mode 100644 langchain/embeddings/llamacpp.py
 create mode 100644 langchain/llms/llamacpp.py
 create mode 100644 tests/integration_tests/embeddings/test_llamacpp.py
 create mode 100644 tests/integration_tests/llms/test_llamacpp.py

diff --git a/docs/ecosystem/llamacpp.md b/docs/ecosystem/llamacpp.md
new file mode 100644
index 00000000..fa89aed0
--- /dev/null
+++ b/docs/ecosystem/llamacpp.md
@@ -0,0 +1,26 @@
+# Llama.cpp
+
+This page covers how to use [llama.cpp](https://github.com/ggerganov/llama.cpp) within LangChain.
+It is broken into two parts: installation and setup, and then references to specific Llama-cpp wrappers.
+
+## Installation and Setup
+- Install the Python package with `pip install llama-cpp-python`
+- Download one of the [supported models](https://github.com/ggerganov/llama.cpp#description) and convert it to the llama.cpp format per the [instructions](https://github.com/ggerganov/llama.cpp)
+
+## Wrappers
+
+### LLM
+
+There exists a LlamaCpp LLM wrapper, which you can access with
+```python
+from langchain.llms import LlamaCpp
+```
+For a more detailed walkthrough of this, see [this notebook](../modules/models/llms/integrations/llamacpp.ipynb)
+
+### Embeddings
+
+There exists a LlamaCpp Embeddings wrapper, which you can access with
+```python
+from langchain.embeddings import LlamaCppEmbeddings
+```
+For a more detailed walkthrough of this, see [this notebook](../modules/models/text_embedding/examples/llamacpp.ipynb)
diff --git a/docs/modules/models/llms/integrations/llamacpp.ipynb b/docs/modules/models/llms/integrations/llamacpp.ipynb
new file mode 100644
index 00000000..3d0d9a10
--- /dev/null
+++ b/docs/modules/models/llms/integrations/llamacpp.ipynb
@@ -0,0 +1,98 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pip install llama-cpp-python"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.llms import LlamaCpp\n",
+    "from langchain import PromptTemplate, LLMChain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "template = \"\"\"Question: {question}\n",
+    "\n",
+    "Answer: Let's think step by step.\"\"\"\n",
+    "\n",
+    "prompt = PromptTemplate(template=template, input_variables=[\"question\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "llm = LlamaCpp(model_path=\"./ggml-model-q4_0.bin\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
"outputs": [], + "source": [ + "llm_chain = LLMChain(prompt=prompt, llm=llm)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n\\nWe know that Justin Bieber is currently 25 years old and that he was born on March 1st, 1994 and that he is a singer and he has an album called Purpose, so we know that he was born when Super Bowl XXXVIII was played between Dallas and Seattle and that it took place February 1st, 2004 and that the Seattle Seahawks won 24-21, so Seattle is our answer!'" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question = \"What NFL team won the Super Bowl in the year Justin Bieber was born?\"\n", + "\n", + "llm_chain.run(question)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "workspace", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/modules/models/text_embedding/examples/llamacpp.ipynb b/docs/modules/models/text_embedding/examples/llamacpp.ipynb new file mode 100644 index 00000000..0d868c8d --- /dev/null +++ b/docs/modules/models/text_embedding/examples/llamacpp.ipynb @@ -0,0 +1,66 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install llama-cpp-python" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import LlamaCppEmbeddings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llama = LlamaCppEmbeddings(model_path=\"/path/to/model/ggml-model-q4_0.bin\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "text = \"This is a test document.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_result = embeddings.embed_query(text)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "doc_result = embeddings.embed_documents([text])" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/reference/integrations.md b/docs/reference/integrations.md index 2691358b..1b87a34e 100644 --- a/docs/reference/integrations.md +++ b/docs/reference/integrations.md @@ -52,6 +52,9 @@ The following use cases require specific installs and api keys: - If you want to set up OpenSearch on your local, [here](https://opensearch.org/docs/latest/) - _DeepLake_: - Install requirements with `pip install deeplake` +- _LlamaCpp_: + - Install requirements with `pip install llama-cpp-python` + - Download model and convert following [llama.cpp instructions](https://github.com/ggerganov/llama.cpp) If you are using the `NLTKTextSplitter` or the `SpacyTextSplitter`, you will also need to install the appropriate models. For example, if you want to use the `SpacyTextSplitter`, you will need to install the `en_core_web_sm` model with `python -m spacy download en_core_web_sm`. 
 Similarly, if you want to use the `NLTKTextSplitter`, you will need to install the `punkt` model with `python -m nltk.downloader punkt`.
diff --git a/langchain/__init__.py b/langchain/__init__.py
index a03ce602..b185ca52 100644
--- a/langchain/__init__.py
+++ b/langchain/__init__.py
@@ -31,6 +31,7 @@ from langchain.llms import (
     ForefrontAI,
     GooseAI,
     HuggingFaceHub,
+    LlamaCpp,
     Modal,
     OpenAI,
     Petals,
@@ -110,4 +111,5 @@ __all__ = [
     "PALChain",
     "set_handler",
     "set_tracing_callback_manager",
+    "LlamaCpp",
 ]
diff --git a/langchain/embeddings/__init__.py b/langchain/embeddings/__init__.py
index c1b33d7a..b46c9de4 100644
--- a/langchain/embeddings/__init__.py
+++ b/langchain/embeddings/__init__.py
@@ -14,6 +14,7 @@ from langchain.embeddings.huggingface import (
 )
 from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
 from langchain.embeddings.jina import JinaEmbeddings
+from langchain.embeddings.llamacpp import LlamaCppEmbeddings
 from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.embeddings.sagemaker_endpoint import SagemakerEndpointEmbeddings
 from langchain.embeddings.self_hosted import SelfHostedEmbeddings
@@ -30,6 +31,7 @@ __all__ = [
     "HuggingFaceEmbeddings",
     "CohereEmbeddings",
     "JinaEmbeddings",
+    "LlamaCppEmbeddings",
     "HuggingFaceHubEmbeddings",
     "TensorflowHubEmbeddings",
     "SagemakerEndpointEmbeddings",
diff --git a/langchain/embeddings/llamacpp.py b/langchain/embeddings/llamacpp.py
new file mode 100644
index 00000000..8b8c6c54
--- /dev/null
+++ b/langchain/embeddings/llamacpp.py
@@ -0,0 +1,118 @@
+"""Wrapper around llama.cpp embedding models."""
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Extra, Field, root_validator
+
+from langchain.embeddings.base import Embeddings
+
+
+class LlamaCppEmbeddings(BaseModel, Embeddings):
+    """Wrapper around llama.cpp embedding models.
+
+    To use, you should have the llama-cpp-python library installed, and provide the
+    path to the Llama model as a named parameter to the constructor.
+    Check out: https://github.com/abetlen/llama-cpp-python
+
+    Example:
+        .. code-block:: python
+
+            from langchain.embeddings import LlamaCppEmbeddings
+            llama = LlamaCppEmbeddings(model_path="/path/to/model.bin")
+    """
+
+    client: Any  #: :meta private:
+    model_path: str
+
+    n_ctx: int = Field(512, alias="n_ctx")
+    """Token context window."""
+
+    n_parts: int = Field(-1, alias="n_parts")
+    """Number of parts to split the model into.
+    If -1, the number of parts is automatically determined."""
+
+    seed: int = Field(-1, alias="seed")
+    """Seed. If -1, a random seed is used."""
+
+    f16_kv: bool = Field(False, alias="f16_kv")
+    """Use half-precision for key/value cache."""
+
+    logits_all: bool = Field(False, alias="logits_all")
+    """Return logits for all tokens, not just the last token."""
+
+    vocab_only: bool = Field(False, alias="vocab_only")
+    """Only load the vocabulary, no weights."""
+
+    use_mlock: bool = Field(False, alias="use_mlock")
+    """Force system to keep model in RAM."""
+
+    n_threads: Optional[int] = Field(None, alias="n_threads")
+    """Number of threads to use.
+    If None, the number of threads is automatically determined."""
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that llama-cpp-python library is installed."""
+        model_path = values["model_path"]
+        n_ctx = values["n_ctx"]
+        n_parts = values["n_parts"]
+        seed = values["seed"]
+        f16_kv = values["f16_kv"]
+        logits_all = values["logits_all"]
+        vocab_only = values["vocab_only"]
+        use_mlock = values["use_mlock"]
+        n_threads = values["n_threads"]
+
+        try:
+            from llama_cpp import Llama
+
+            values["client"] = Llama(
+                model_path=model_path,
+                n_ctx=n_ctx,
+                n_parts=n_parts,
+                seed=seed,
+                f16_kv=f16_kv,
+                logits_all=logits_all,
+                vocab_only=vocab_only,
+                use_mlock=use_mlock,
+                n_threads=n_threads,
+                embedding=True,
+            )
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import llama-cpp-python library. "
+                "Please install the llama-cpp-python library to "
+                "use this embedding model: pip install llama-cpp-python"
+            )
+        except Exception:
+            raise NameError(f"Could not load Llama model from path: {model_path}")
+
+        return values
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed a list of documents using the Llama model.
+
+        Args:
+            texts: The list of texts to embed.
+
+        Returns:
+            List of embeddings, one for each text.
+        """
+        embeddings = [self.client.embed(text) for text in texts]
+        return [list(map(float, e)) for e in embeddings]
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed a query using the Llama model.
+
+        Args:
+            text: The text to embed.
+
+        Returns:
+            Embeddings for the text.
+        """
+        embedding = self.client.embed(text)
+        return list(map(float, embedding))
diff --git a/langchain/llms/__init__.py b/langchain/llms/__init__.py
index 1cc5d0f0..b5acdc12 100644
--- a/langchain/llms/__init__.py
+++ b/langchain/llms/__init__.py
@@ -14,6 +14,7 @@ from langchain.llms.gooseai import GooseAI
 from langchain.llms.huggingface_endpoint import HuggingFaceEndpoint
 from langchain.llms.huggingface_hub import HuggingFaceHub
 from langchain.llms.huggingface_pipeline import HuggingFacePipeline
+from langchain.llms.llamacpp import LlamaCpp
 from langchain.llms.modal import Modal
 from langchain.llms.nlpcloud import NLPCloud
 from langchain.llms.openai import AzureOpenAI, OpenAI, OpenAIChat
@@ -35,6 +36,7 @@ __all__ = [
     "DeepInfra",
     "ForefrontAI",
     "GooseAI",
+    "LlamaCpp",
     "Modal",
     "NLPCloud",
     "OpenAI",
@@ -67,6 +69,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = {
     "gooseai": GooseAI,
     "huggingface_hub": HuggingFaceHub,
     "huggingface_endpoint": HuggingFaceEndpoint,
+    "llamacpp": LlamaCpp,
     "modal": Modal,
     "sagemaker_endpoint": SagemakerEndpoint,
     "nlpcloud": NLPCloud,
diff --git a/langchain/llms/llamacpp.py b/langchain/llms/llamacpp.py
new file mode 100644
index 00000000..536755fe
--- /dev/null
+++ b/langchain/llms/llamacpp.py
@@ -0,0 +1,184 @@
+"""Wrapper around llama.cpp."""
+import logging
+from typing import Any, Dict, List, Optional
+
+from pydantic import BaseModel, Field, root_validator
+
+from langchain.llms.base import LLM
+
+logger = logging.getLogger(__name__)
+
+
+class LlamaCpp(LLM, BaseModel):
+    """Wrapper around the llama.cpp model.
+
+    To use, you should have the llama-cpp-python library installed, and provide the
+    path to the Llama model as a named parameter to the constructor.
+    Check out: https://github.com/abetlen/llama-cpp-python
+
+    Example:
+        .. code-block:: python
+
+            from langchain.llms import LlamaCpp
+            llm = LlamaCpp(model_path="/path/to/llama/model")
+    """
+
+    client: Any  #: :meta private:
+    model_path: str
+    """The path to the Llama model file."""
+
+    n_ctx: int = Field(512, alias="n_ctx")
+    """Token context window."""
+
+    n_parts: int = Field(-1, alias="n_parts")
+    """Number of parts to split the model into.
+    If -1, the number of parts is automatically determined."""
+
+    seed: int = Field(-1, alias="seed")
+    """Seed. If -1, a random seed is used."""
+
+    f16_kv: bool = Field(False, alias="f16_kv")
+    """Use half-precision for key/value cache."""
+
+    logits_all: bool = Field(False, alias="logits_all")
+    """Return logits for all tokens, not just the last token."""
+
+    vocab_only: bool = Field(False, alias="vocab_only")
+    """Only load the vocabulary, no weights."""
+
+    use_mlock: bool = Field(False, alias="use_mlock")
+    """Force system to keep model in RAM."""
+
+    n_threads: Optional[int] = Field(None, alias="n_threads")
+    """Number of threads to use.
+    If None, the number of threads is automatically determined."""
+
+    suffix: Optional[str] = Field(None)
+    """A suffix to append to the generated text. If None, no suffix is appended."""
+
+    max_tokens: Optional[int] = 256
+    """The maximum number of tokens to generate."""
+
+    temperature: Optional[float] = 0.8
+    """The temperature to use for sampling."""
+
+    top_p: Optional[float] = 0.95
+    """The top-p value to use for sampling."""
+
+    logprobs: Optional[int] = Field(None)
+    """The number of logprobs to return. If None, no logprobs are returned."""
+
+    echo: Optional[bool] = False
+    """Whether to echo the prompt."""
+
+    stop: Optional[List[str]] = []
+    """A list of strings to stop generation when encountered."""
+
+    repeat_penalty: Optional[float] = 1.1
+    """The penalty to apply to repeated tokens."""
+
+    top_k: Optional[int] = 40
+    """The top-k value to use for sampling."""
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that llama-cpp-python library is installed."""
+        model_path = values["model_path"]
+        n_ctx = values["n_ctx"]
+        n_parts = values["n_parts"]
+        seed = values["seed"]
+        f16_kv = values["f16_kv"]
+        logits_all = values["logits_all"]
+        vocab_only = values["vocab_only"]
+        use_mlock = values["use_mlock"]
+        n_threads = values["n_threads"]
+
+        try:
+            from llama_cpp import Llama
+
+            values["client"] = Llama(
+                model_path=model_path,
+                n_ctx=n_ctx,
+                n_parts=n_parts,
+                seed=seed,
+                f16_kv=f16_kv,
+                logits_all=logits_all,
+                vocab_only=vocab_only,
+                use_mlock=use_mlock,
+                n_threads=n_threads,
+            )
+        except ImportError:
+            raise ModuleNotFoundError(
+                "Could not import llama-cpp-python library. "
" + "Please install the llama-cpp-python library to " + "use this embedding model: pip install llama-cpp-python" + ) + except Exception: + raise NameError(f"Could not load Llama model from path: {model_path}") + + return values + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters for calling llama_cpp.""" + return { + "suffix": self.suffix, + "max_tokens": self.max_tokens, + "temperature": self.temperature, + "top_p": self.top_p, + "logprobs": self.logprobs, + "echo": self.echo, + "stop_sequences": self.stop, + "repeat_penalty": self.repeat_penalty, + "top_k": self.top_k, + } + + @property + def _identifying_params(self) -> Dict[str, Any]: + """Get the identifying parameters.""" + return {**{"model_path": self.model_path}, **self._default_params} + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "llama.cpp" + + def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str: + """Call the Llama model and return the output. + + Args: + prompt: The prompt to use for generation. + stop: A list of strings to stop generation when encountered. + + Returns: + The generated text. + + Example: + .. code-block:: python + + from langchain.llms import LlamaCppEmbeddings + llm = LlamaCppEmbeddings(model_path="/path/to/local/llama/model.bin") + llm("This is a prompt.") + """ + + params = self._default_params + if self.stop and stop is not None: + raise ValueError("`stop` found in both the input and default params.") + elif self.stop: + params["stop_sequences"] = self.stop + else: + params["stop_sequences"] = [] + + """Call the Llama model and return the output.""" + text = self.client( + prompt=prompt, + max_tokens=params["max_tokens"], + temperature=params["temperature"], + top_p=params["top_p"], + logprobs=params["logprobs"], + echo=params["echo"], + stop=params["stop_sequences"], + repeat_penalty=params["repeat_penalty"], + top_k=params["top_k"], + ) + return text["choices"][0]["text"] diff --git a/tests/integration_tests/embeddings/test_llamacpp.py b/tests/integration_tests/embeddings/test_llamacpp.py new file mode 100644 index 00000000..36aed8e9 --- /dev/null +++ b/tests/integration_tests/embeddings/test_llamacpp.py @@ -0,0 +1,46 @@ +# flake8: noqa +"""Test llamacpp embeddings.""" +import os +from urllib.request import urlretrieve + +from langchain.embeddings.llamacpp import LlamaCppEmbeddings + + +def get_model() -> str: + """Download model. + From https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/, + convert to new ggml format and return model path. + """ + model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin" + tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model" + conversion_script = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py" + local_filename = model_url.split("/")[-1] + + if not os.path.exists("convert-unversioned-ggml-to-ggml.py"): + urlretrieve(conversion_script, "convert-unversioned-ggml-to-ggml.py") + if not os.path.exists("tokenizer.model"): + urlretrieve(tokenizer_url, "tokenizer.model") + if not os.path.exists(local_filename): + urlretrieve(model_url, local_filename) + os.system("python convert-unversioned-ggml-to-ggml.py . 
+        os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")
+
+    return local_filename
+
+
+def test_llamacpp_embedding_documents() -> None:
+    """Test llamacpp embeddings."""
+    documents = ["foo bar"]
+    model_path = get_model()
+    embedding = LlamaCppEmbeddings(model_path=model_path)
+    output = embedding.embed_documents(documents)
+    assert len(output) == 1
+    assert len(output[0]) == 512
+
+
+def test_llamacpp_embedding_query() -> None:
+    """Test llamacpp embeddings."""
+    document = "foo bar"
+    model_path = get_model()
+    embedding = LlamaCppEmbeddings(model_path=model_path)
+    output = embedding.embed_query(document)
+    assert len(output) == 512
diff --git a/tests/integration_tests/llms/test_llamacpp.py b/tests/integration_tests/llms/test_llamacpp.py
new file mode 100644
index 00000000..11758aa6
--- /dev/null
+++ b/tests/integration_tests/llms/test_llamacpp.py
@@ -0,0 +1,34 @@
+# flake8: noqa
+"""Test Llama.cpp wrapper."""
+import os
+from urllib.request import urlretrieve
+
+from langchain.llms import LlamaCpp
+
+
+def get_model() -> str:
+    """Download model.
+    From https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/,
+    convert to new ggml format and return model path."""
+    model_url = "https://huggingface.co/Sosaka/Alpaca-native-4bit-ggml/resolve/main/ggml-alpaca-7b-q4.bin"
+    tokenizer_url = "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model"
+    conversion_script = "https://github.com/ggerganov/llama.cpp/raw/master/convert-unversioned-ggml-to-ggml.py"
+    local_filename = model_url.split("/")[-1]
+
+    if not os.path.exists("convert-unversioned-ggml-to-ggml.py"):
+        urlretrieve(conversion_script, "convert-unversioned-ggml-to-ggml.py")
+    if not os.path.exists("tokenizer.model"):
+        urlretrieve(tokenizer_url, "tokenizer.model")
+    if not os.path.exists(local_filename):
+        urlretrieve(model_url, local_filename)
+        os.system("python convert-unversioned-ggml-to-ggml.py . tokenizer.model")
+
+    return local_filename
+
+
+def test_llamacpp_inference() -> None:
+    """Test valid llama.cpp inference."""
+    model_path = get_model()
+    llm = LlamaCpp(model_path=model_path)
+    output = llm("Say foo:")
+    assert isinstance(output, str)
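
For anyone who wants to exercise both new wrappers together, the sketch below strings the classes added in this patch into one script. It is a minimal sketch, not part of the PR: it assumes `llama-cpp-python` is installed and that a ggml model has already been converted per the llama.cpp instructions, and the `MODEL_PATH` value is a placeholder you would adjust locally. Output will vary with the model used.

```python
# Minimal sketch exercising the LlamaCpp LLM and LlamaCppEmbeddings wrappers
# added in this patch. Assumes `pip install llama-cpp-python` and a locally
# converted ggml model; MODEL_PATH is a placeholder, not shipped with the PR.
from langchain import LLMChain, PromptTemplate
from langchain.embeddings import LlamaCppEmbeddings
from langchain.llms import LlamaCpp

MODEL_PATH = "./ggml-model-q4_0.bin"  # adjust to your converted model file

# LLM wrapper: run a prompt through an LLMChain, as in the docs notebook.
llm = LlamaCpp(model_path=MODEL_PATH)
prompt = PromptTemplate(
    template="Question: {question}\n\nAnswer: Let's think step by step.",
    input_variables=["question"],
)
llm_chain = LLMChain(prompt=prompt, llm=llm)
print(llm_chain.run("What NFL team won the Super Bowl in the year Justin Bieber was born?"))

# Embeddings wrapper: embed a query string and a list of documents.
embeddings = LlamaCppEmbeddings(model_path=MODEL_PATH)
text = "This is a test document."
print(len(embeddings.embed_query(text)))       # embedding dimension of the model
print(len(embeddings.embed_documents([text])))  # one embedding per input document
```

Note that both classes construct the `llama_cpp.Llama` client inside a `root_validator`, so a missing library or a bad `model_path` fails at construction time rather than on the first call.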