langchain/libs/community/tests/unit_tests/llms/test_llamafile.py
Kate Silverstein 0bc4a9b3fc
community[minor]: Adds Llamafile as an LLM (#17431)
* **Description:** Adds a simple LLM implementation for interacting with
[llamafile](https://github.com/Mozilla-Ocho/llamafile)-based models.
* **Dependencies:** N/A
* **Issue:** N/A

**Detail**
[llamafile](https://github.com/Mozilla-Ocho/llamafile) lets you run LLMs
locally from a single file on most computers without installing any
dependencies.

To use the llamafile LLM implementation, the user needs to:

1. Download a llamafile, e.g.
https://huggingface.co/jartine/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/TinyLlama-1.1B-Chat-v1.0.Q5_K_M.llamafile?download=true
2. Make the file executable.
3. Run the llamafile in 'server mode'. (All llamafiles come packaged
with a lightweight server; by default, the server listens at
`http://localhost:8080`.)


```bash
wget https://url/of/model.llamafile
chmod +x model.llamafile
./model.llamafile --server --nobrowser
```

Now, the user can invoke the LLM via the LangChain client:

```python
from langchain_community.llms.llamafile import Llamafile

llm = Llamafile()

llm.invoke("Tell me a joke.")
```
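
Streaming is also supported via the `streaming=True` flag (exercised by the unit
tests below). A minimal usage sketch, assuming the standard LangChain
`.stream()` interface:

```python
from langchain_community.llms.llamafile import Llamafile

llm = Llamafile(streaming=True)

# Print the completion chunk-by-chunk as the llamafile server produces it.
for chunk in llm.stream("Tell me a joke."):
    print(chunk, end="", flush=True)
```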
2024-02-14 11:15:24 -08:00


import json
from collections import deque
from typing import Any, Dict

import pytest
import requests
from pytest import MonkeyPatch

from langchain_community.llms.llamafile import Llamafile

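# Default generation parameters that the Llamafile LLM is expected to send to
# the llamafile server's /completion endpoint when no overrides are passed.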
def default_generation_params() -> Dict[str, Any]:
    return {
        "temperature": 0.8,
        "seed": -1,
        "top_k": 40,
        "top_p": 0.95,
        "min_p": 0.05,
        "n_predict": -1,
        "n_keep": 0,
        "tfs_z": 1.0,
        "typical_p": 1.0,
        "repeat_penalty": 1.1,
        "repeat_last_n": 64,
        "penalize_nl": True,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "mirostat": 0,
        "mirostat_tau": 5.0,
        "mirostat_eta": 0.1,
    }

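# Canned non-streaming response: a requests.Response whose JSON body mimics the
# llamafile server's /completion output.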
def mock_response() -> requests.Response:
    contents = json.dumps({"content": "the quick brown fox"})
    response = requests.Response()
    response.status_code = 200
    response._content = str.encode(contents)
    return response

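# Canned streaming response: server-sent-event style chunks are surfaced
# through response.raw.read(), which is how the stream is consumed when
# streaming is enabled.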
def mock_response_stream():  # type: ignore[no-untyped-def]
    mock_response = deque(
        [
            b'data: {"content":"the","multimodal":false,"slot_id":0,"stop":false}\n\n',  # noqa
            b'data: {"content":" quick","multimodal":false,"slot_id":0,"stop":false}\n\n',  # noqa
        ]
    )

    class MockRaw:
        def read(self, chunk_size):  # type: ignore[no-untyped-def]
            try:
                return mock_response.popleft()
            except IndexError:
                return None

    response = requests.Response()
    response.status_code = 200
    response.raw = MockRaw()
    return response

def test_call(monkeypatch: MonkeyPatch) -> None:
    """
    Test basic functionality of the `invoke` method.
    """
    llm = Llamafile(
        base_url="http://llamafile-host:8080",
    )

    def mock_post(url, headers, json, stream, timeout):  # type: ignore[no-untyped-def]
        assert url == "http://llamafile-host:8080/completion"
        assert headers == {
            "Content-Type": "application/json",
        }
        assert json == {"prompt": "Test prompt", **default_generation_params()}
        assert stream is False
        assert timeout is None
        return mock_response()

    monkeypatch.setattr(requests, "post", mock_post)
    out = llm.invoke("Test prompt")
    assert out == "the quick brown fox"

def test_call_with_kwargs(monkeypatch: MonkeyPatch) -> None:
    """
    Test kwargs passed to `invoke` override the default values and are passed
    to the endpoint correctly. Also test that any 'unknown' kwargs that are not
    present in the LLM class attrs are ignored.
    """
    llm = Llamafile(
        base_url="http://llamafile-host:8080",
    )

    def mock_post(url, headers, json, stream, timeout):  # type: ignore[no-untyped-def]
        assert url == "http://llamafile-host:8080/completion"
        assert headers == {
            "Content-Type": "application/json",
        }
        # 'unknown' kwarg should be ignored
        expected = {"prompt": "Test prompt", **default_generation_params()}
        expected["seed"] = 0
        assert json == expected
        assert stream is False
        assert timeout is None
        return mock_response()

    monkeypatch.setattr(requests, "post", mock_post)
    out = llm.invoke(
        "Test prompt",
        unknown="unknown option",  # should be ignored
        seed=0,  # should override the default
    )
    assert out == "the quick brown fox"

def test_call_raises_exception_on_missing_server(monkeypatch: MonkeyPatch) -> None:
    """
    Test that the LLM raises a ConnectionError when no llamafile server is
    listening at the base_url.
    """
    llm = Llamafile(
        # invalid url, nothing should actually be running here
        base_url="http://llamafile-host:8080",
    )
    with pytest.raises(requests.exceptions.ConnectionError):
        llm.invoke("Test prompt")

def test_streaming(monkeypatch: MonkeyPatch) -> None:
    """
    Test basic functionality of `invoke` with streaming enabled.
    """
    llm = Llamafile(
        base_url="http://llamafile-hostname:8080",
        streaming=True,
    )

    def mock_post(url, headers, json, stream, timeout):  # type: ignore[no-untyped-def]
        assert url == "http://llamafile-hostname:8080/completion"
        assert headers == {
            "Content-Type": "application/json",
        }
        # 'unknown' kwarg should be ignored
        assert "unknown" not in json
        expected = {"prompt": "Test prompt", **default_generation_params()}
        expected["stream"] = True
        assert json == expected
        assert stream is True
        assert timeout is None
        return mock_response_stream()

    monkeypatch.setattr(requests, "post", mock_post)
    out = llm.invoke("Test prompt")
    assert out == "the quick"