Bagatur/nuclia vector (#10301)

pull/10302/head^2
Bagatur 1 year ago committed by GitHub
commit 849e345371
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -93,8 +93,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
},
"orig_nbformat": 4
},

@ -0,0 +1,151 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NucliaDB vector store\n",
"\n",
"You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n",
"\n",
"When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [create a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install nuclia"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage with nuclia.cloud"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "nuclia python package not found. Please install it with `pip install nuclia`.",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:39\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 39\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m 40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'nuclia'",
"\nDuring handling of the above exception, another exception occurred:\n",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvectorstores\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mnucliadb\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaDB\n\u001b[1;32m 2\u001b[0m API_KEY \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mYOUR_API_KEY\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m ndb \u001b[39m=\u001b[39m NucliaDB(knowledge_box\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mYOUR_KB_ID\u001b[39;49m\u001b[39m\"\u001b[39;49m, local\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, api_key\u001b[39m=\u001b[39;49mAPI_KEY)\n",
"File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:41\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m 40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n\u001b[0;32m---> 41\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 42\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnuclia python package not found. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 43\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPlease install it with `pip install nuclia`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 44\u001b[0m )\n\u001b[1;32m 45\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_config[\u001b[39m\"\u001b[39m\u001b[39mLOCAL\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m local\n\u001b[1;32m 46\u001b[0m zone \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39menviron\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mNUCLIA_ZONE\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39meurope-1\u001b[39m\u001b[39m\"\u001b[39m)\n",
"\u001b[0;31mValueError\u001b[0m: nuclia python package not found. Please install it with `pip install nuclia`."
]
}
],
"source": [
"from langchain.vectorstores.nucliadb import NucliaDB\n",
"API_KEY = \"YOUR_API_KEY\"\n",
"\n",
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=False, api_key=API_KEY)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage with a local instance\n",
"\n",
"Note: By default `backend` is set to `http://localhost:8080`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.vectorstores.nucliadb import NucliaDB\n",
"\n",
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=True, backend=\"http://my-local-server\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add and delete texts to your Knowledge Box"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ids = ndb.add_texts([\"This is a new test\", \"This is a second test\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ndb.delete(ids=ids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Search in your Knowledge Box"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = ndb.similarity_search(\"Who was inspired by Ada Lovelace?\")\n",
"print(res.page_content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}

@ -0,0 +1,159 @@
import os
from typing import Any, Dict, Iterable, List, Optional, Type
from langchain.embeddings.base import Embeddings
from langchain.schema.document import Document
from langchain.vectorstores.base import VST, VectorStore
# Maps the one-letter field-type code found in a paragraph id to the name of
# the attribute on a resource's data that holds fields of that type.
FIELD_TYPES: Dict[str, str] = dict(
    f="files",
    t="texts",
    l="links",
)
class NucliaDB(VectorStore):
    """NucliaDB vector store.

    Wraps the ``nuclia`` SDK so a Knowledge Box (hosted on Nuclia Cloud or on a
    local NucliaDB instance) can be used through the ``VectorStore`` interface.
    """

    # Connection settings, keyed by "LOCAL", "BACKEND" and "TOKEN". Only the
    # annotation lives on the class; the dict itself is created per instance in
    # ``__init__`` so that two stores never share (and overwrite) one config.
    _config: Dict[str, Any]

    def __init__(
        self,
        knowledge_box: str,
        local: bool,
        api_key: Optional[str] = None,
        backend: Optional[str] = None,
    ) -> None:
        """Initialize the NucliaDB client.

        Args:
            knowledge_box: the Knowledge Box id.
            local: Whether to use a local NucliaDB instance or Nuclia Cloud
            api_key: A contributor API key for the kb (needed when local is False)
            backend: The backend url to use when local is True, defaults to
                http://localhost:8080

        Raises:
            ValueError: if the ``nuclia`` python package is not installed.
        """
        try:
            from nuclia.sdk import NucliaAuth
        except ImportError:
            raise ValueError(
                "nuclia python package not found. "
                "Please install it with `pip install nuclia`."
            )
        # A fresh dict per instance: a class-level dict would be shared by every
        # NucliaDB object, letting a later instance clobber BACKEND/TOKEN.
        self._config = {}
        self._config["LOCAL"] = local
        # The cloud zone is read from the environment, falling back to europe-1.
        zone = os.environ.get("NUCLIA_ZONE", "europe-1")
        self._kb = knowledge_box
        if local:
            if not backend:
                backend = "http://localhost:8080"
            self._config["BACKEND"] = f"{backend}/api/v1"
            self._config["TOKEN"] = None
            NucliaAuth().nucliadb(url=backend)
            NucliaAuth().kb(url=self.kb_url, interactive=False)
        else:
            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
            self._config["TOKEN"] = api_key
            NucliaAuth().kb(
                url=self.kb_url, token=self._config["TOKEN"], interactive=False
            )

    @property
    def is_local(self) -> bool:
        """Whether this store was configured with ``local=True``."""
        return self._config["LOCAL"]

    @property
    def kb_url(self) -> str:
        """URL of the Knowledge Box: ``<backend>/kb/<knowledge_box>``."""
        return f"{self._config['BACKEND']}/kb/{self._kb}"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to NucliaDB.

        One resource is created per text. When ``metadatas`` is given, the entry
        at the same position as the text is stored under the resource's
        ``extra["metadata"]``; otherwise an empty string is stored there.

        Args:
            texts: the texts to upload.
            metadatas: optional per-text metadata, aligned with ``texts``.

        Returns:
            The values returned by the SDK's ``create`` call, one per text.
        """
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        ids = []
        for i, text in enumerate(texts):
            extra: Dict[str, Any] = {"metadata": ""}
            if metadatas:
                extra = {"metadata": metadatas[i]}
            # ``rid`` rather than ``id`` to avoid shadowing the builtin.
            rid = factory.create(
                texts={"text": {"body": text}},
                extra=extra,
                url=self.kb_url,
                api_key=self._config["TOKEN"],
            )
            ids.append(rid)
        return ids

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete resources by id.

        Args:
            ids: resource ids to delete.

        Returns:
            ``None`` when ``ids`` is empty or not given; otherwise ``True`` only
            if every deletion went through. A ``ValueError`` raised by the SDK's
            ``delete`` call is counted as a failed deletion rather than
            propagated, and the remaining ids are still attempted.
        """
        if not ids:
            return None
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        results: List[bool] = []
        for rid in ids:
            try:
                factory.delete(rid=rid, url=self.kb_url, api_key=self._config["TOKEN"])
                results.append(True)
            except ValueError:
                results.append(False)
        return all(results)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Search the Knowledge Box and return matching paragraphs as documents.

        Args:
            query: the search query.
            k: passed to the find request as its ``page_size``.

        Returns:
            One ``Document`` per matching paragraph, sorted by the paragraph's
            ``order``. ``page_content`` is the paragraph text; ``metadata``
            holds ``"extra"`` (the resource's extra metadata, if any) and
            ``"value"`` (the field the paragraph belongs to, if found).
        """
        from nuclia.sdk import NucliaSearch
        from nucliadb_models.search import FindRequest, ResourceProperties

        request = FindRequest(
            query=query,
            page_size=k,
            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
        )
        search = NucliaSearch()
        results = search.find(
            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
        )
        paragraphs = []
        for resource in results.resources.values():
            for field in resource.fields.values():
                for paragraph_id, paragraph in field.paragraphs.items():
                    # The id is split on "/": segment 1 is the field-type code
                    # and segment 2 the field id.
                    info = paragraph_id.split("/")
                    if len(info) < 3:
                        # Malformed id: skip it instead of raising IndexError.
                        continue
                    field_type = FIELD_TYPES.get(info[1], None)
                    field_id = info[2]
                    if not field_type:
                        continue
                    # ``or {}`` also covers a container attribute that exists
                    # but is None.
                    field_values = getattr(resource.data, field_type, None) or {}
                    value = field_values.get(field_id, None)
                    paragraphs.append(
                        {
                            "text": paragraph.text,
                            "metadata": {
                                "extra": getattr(
                                    getattr(resource, "extra", {}), "metadata", None
                                ),
                                "value": value,
                            },
                            "order": paragraph.order,
                        }
                    )
        sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
        return [
            Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
            for paragraph in sorted_paragraphs
        ]

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from texts and embeddings.

        Not supported by this store: always raises ``NotImplementedError``.
        """
        raise NotImplementedError

File diff suppressed because it is too large Load Diff

@ -0,0 +1,98 @@
from typing import Any
from unittest import mock
from langchain.vectorstores.nucliadb import NucliaDB
class attrdict(dict):
    """A dict whose keys can also be read as attributes.

    Nested plain dicts are wrapped in ``attrdict`` when they are read, so
    chained access such as ``d.a.b`` works.
    """

    def __getitem__(self, key: str) -> Any:
        """Return the value for ``key``, wrapping dict values in ``attrdict``."""
        value = dict.__getitem__(self, key)
        return attrdict(value) if isinstance(value, dict) else value

    def __getattr__(self, name: str) -> Any:
        """Resolve a missing attribute as a key lookup.

        A missing key is reported as ``AttributeError`` (not ``KeyError``) so
        that ``getattr(obj, name, default)`` and ``hasattr`` behave normally.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None
def FakeCreate(**args: Any) -> Any:
    """Build a stand-in for ``NucliaResource.create``.

    Intended for ``mock.patch(..., new_callable=FakeCreate)``: the returned
    function ignores its keyword arguments and always yields ``"fake_uuid"``.
    """

    def fake_create(self: Any, **kwargs: Any) -> str:
        return "fake_uuid"

    return fake_create
def FakeDelete(**args: Any) -> Any:
    """Build a stand-in for ``NucliaResource.delete``.

    Intended for ``mock.patch(..., new_callable=FakeDelete)``: the returned
    function accepts any keyword arguments, does nothing and returns ``None``.
    """

    def fake_delete(self: Any, **kwargs: Any) -> None:
        return None

    return fake_delete
def FakeFind(**args: Any) -> Any:
    """Build a stand-in for ``NucliaSearch.find``.

    Intended for ``mock.patch(..., new_callable=FakeFind)``: the returned
    function ignores its keyword arguments and yields a canned result holding a
    single resource ("123") with one field ("456") and one paragraph.
    """

    def fake_find(self: Any, **kwargs: Any) -> Any:
        # Built inside-out so each level of the nested payload has a name.
        paragraph = attrdict(
            {
                "text": "This is a test",
                "order": 0,
            }
        )
        field = attrdict(
            {
                "paragraphs": {
                    "123/t/text/0-14": paragraph,
                }
            }
        )
        resource = attrdict(
            {
                "fields": {"456": field},
                "data": {
                    "texts": {
                        "text": {
                            "body": "This is a test",
                        }
                    }
                },
                "extra": attrdict({"metadata": {"some": "metadata"}}),
            }
        )
        return attrdict({"resources": {"123": resource}})

    return fake_find
def test_add_texts() -> None:
    """Adding two texts with ``NucliaResource.create`` patched yields two ids."""
    texts = ["This is a new test", "This is a second test"]
    create_patch = mock.patch(
        "nuclia.sdk.resource.NucliaResource.create",
        new_callable=FakeCreate,
    )
    with create_patch:
        store = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        assert store.is_local is False
        created_ids = store.add_texts(texts)
        assert len(created_ids) == 2
def test_delete() -> None:
    """Deleting ids with ``NucliaResource.delete`` patched reports success."""
    delete_patch = mock.patch(
        "nuclia.sdk.resource.NucliaResource.delete",
        new_callable=FakeDelete,
    )
    with delete_patch:
        store = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        deleted = store.delete(["123", "456"])
        assert deleted
def test_search() -> None:
    """A search with ``NucliaSearch.find`` patched returns the canned paragraph."""
    find_patch = mock.patch(
        "nuclia.sdk.search.NucliaSearch.find",
        new_callable=FakeFind,
    )
    with find_patch:
        store = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        documents = store.similarity_search("Who was inspired by Ada Lovelace?")
        assert len(documents) == 1
        first = documents[0]
        assert first.page_content == "This is a test"
        assert first.metadata["extra"]["some"] == "metadata"
        assert first.metadata["value"]["body"] == "This is a test"
Loading…
Cancel
Save