mirror of https://github.com/hwchase17/langchain
Bagatur/nuclia vector (#10301)
commit
849e345371
@ -0,0 +1,151 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# NucliaDB vector store\n",
|
||||
"\n",
|
||||
"You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n",
|
||||
"\n",
|
||||
"When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [creating a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#!pip install nuclia"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage with nuclia.cloud"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "ValueError",
|
||||
"evalue": "nuclia python package not found. Please install it with `pip install nuclia`.",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
||||
"File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:39\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 39\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m 40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n",
|
||||
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'nuclia'",
|
||||
"\nDuring handling of the above exception, another exception occurred:\n",
|
||||
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvectorstores\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mnucliadb\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaDB\n\u001b[1;32m 2\u001b[0m API_KEY \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mYOUR_API_KEY\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m ndb \u001b[39m=\u001b[39m NucliaDB(knowledge_box\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mYOUR_KB_ID\u001b[39;49m\u001b[39m\"\u001b[39;49m, local\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, api_key\u001b[39m=\u001b[39;49mAPI_KEY)\n",
|
||||
"File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:41\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m 40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n\u001b[0;32m---> 41\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m 42\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mnuclia python package not found. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 43\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mPlease install it with `pip install nuclia`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 44\u001b[0m )\n\u001b[1;32m 45\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_config[\u001b[39m\"\u001b[39m\u001b[39mLOCAL\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m local\n\u001b[1;32m 46\u001b[0m zone \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39menviron\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mNUCLIA_ZONE\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39meurope-1\u001b[39m\u001b[39m\"\u001b[39m)\n",
|
||||
"\u001b[0;31mValueError\u001b[0m: nuclia python package not found. Please install it with `pip install nuclia`."
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.vectorstores.nucliadb import NucliaDB\n",
|
||||
"API_KEY = \"YOUR_API_KEY\"\n",
|
||||
"\n",
|
||||
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=False, api_key=API_KEY)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Usage with a local instance\n",
|
||||
"\n",
|
||||
"Note: By default `backend` is set to `http://localhost:8080`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.vectorstores.nucliadb import NucliaDB\n",
|
||||
"\n",
|
||||
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=True, backend=\"http://my-local-server\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Add texts to and delete texts from your Knowledge Box"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ids = ndb.add_texts([\"This is a new test\", \"This is a second test\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ndb.delete(ids=ids)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Search in your Knowledge Box"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"results = ndb.similarity_search(\"Who was inspired by Ada Lovelace?\")\n",
|
||||
"print(results[0].page_content)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "langchain",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.5"
|
||||
},
|
||||
"orig_nbformat": 4
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
@ -0,0 +1,159 @@
|
||||
import os
|
||||
from typing import Any, Dict, Iterable, List, Optional, Type
|
||||
|
||||
from langchain.embeddings.base import Embeddings
|
||||
from langchain.schema.document import Document
|
||||
from langchain.vectorstores.base import VST, VectorStore
|
||||
|
||||
# Maps the one-letter field-type code found in a paragraph id to the name of
# the attribute that similarity_search reads from a resource's data.
FIELD_TYPES = dict(f="files", t="texts", l="links")
|
||||
|
||||
|
||||
class NucliaDB(VectorStore):
    """NucliaDB vector store.

    Thin wrapper around the `nuclia` Python SDK that stores texts as resources
    in a Knowledge Box and retrieves matching paragraphs through its find
    endpoint. The Knowledge Box can live in a local NucliaDB instance or in
    Nuclia Cloud.
    """

    # Annotation only: the dict itself is created per instance in __init__.
    # A class-level `{}` would be shared (and overwritten) by every instance.
    _config: Dict[str, Any]

    def __init__(
        self,
        knowledge_box: str,
        local: bool,
        api_key: Optional[str] = None,
        backend: Optional[str] = None,
    ) -> None:
        """Initialize the NucliaDB client.

        Args:
            knowledge_box: the Knowledge Box id.
            local: Whether to use a local NucliaDB instance or Nuclia Cloud
            api_key: A contributor API key for the kb (needed when local is False)
            backend: The backend url to use when local is True, defaults to
                http://localhost:8080

        Raises:
            ValueError: if the `nuclia` package is not installed.
        """
        try:
            from nuclia.sdk import NucliaAuth
        except ImportError:
            raise ValueError(
                "nuclia python package not found. "
                "Please install it with `pip install nuclia`."
            )
        # Fresh dict per instance so two stores never share backend/token.
        self._config = {}
        self._config["LOCAL"] = local
        # The cloud zone is taken from the environment, not from an argument.
        zone = os.environ.get("NUCLIA_ZONE", "europe-1")
        self._kb = knowledge_box
        if local:
            if not backend:
                backend = "http://localhost:8080"
            self._config["BACKEND"] = f"{backend}/api/v1"
            self._config["TOKEN"] = None
            NucliaAuth().nucliadb(url=backend)
            NucliaAuth().kb(url=self.kb_url, interactive=False)
        else:
            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
            self._config["TOKEN"] = api_key
            NucliaAuth().kb(
                url=self.kb_url, token=self._config["TOKEN"], interactive=False
            )

    @property
    def is_local(self) -> bool:
        """Value of the `local` flag given at construction time."""
        return self._config["LOCAL"]

    @property
    def kb_url(self) -> str:
        """URL of the Knowledge Box: ``<backend>/kb/<knowledge_box>``."""
        return f"{self._config['BACKEND']}/kb/{self._kb}"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to NucliaDB

        Each text is created as its own resource with a single text field
        named ``text``.

        Args:
            texts: the texts to upload.
            metadatas: optional per-text metadata, matched to `texts` by
                position and stored under the resource's ``extra.metadata``.

        Returns:
            The values returned by the SDK's create call, one per text, in
            input order.
        """
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        ids = []
        for i, text in enumerate(texts):
            # An empty string is sent as metadata when none was provided.
            extra: Dict[str, Any] = {"metadata": ""}
            if metadatas:
                extra = {"metadata": metadatas[i]}
            rid = factory.create(
                texts={"text": {"body": text}},
                extra=extra,
                url=self.kb_url,
                api_key=self._config["TOKEN"],
            )
            ids.append(rid)
        return ids

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete resources by id.

        Args:
            ids: ids of the resources to delete.

        Returns:
            None when `ids` is empty or missing; otherwise True only if no
            deletion raised ValueError. A failed deletion does not stop the
            remaining ids from being attempted.
        """
        if not ids:
            return None
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        results: List[bool] = []
        for rid in ids:
            try:
                factory.delete(rid=rid, url=self.kb_url, api_key=self._config["TOKEN"])
                results.append(True)
            except ValueError:
                results.append(False)
        return all(results)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        """Search the Knowledge Box and return matching paragraphs.

        Args:
            query: the text to search for.
            k: passed to the find request as its page size.

        Returns:
            One Document per matching paragraph, sorted by the paragraph's
            ``order``. ``metadata["extra"]`` holds the resource's extra
            metadata (or None) and ``metadata["value"]`` holds the field value
            the paragraph belongs to (or None).
        """
        from nuclia.sdk import NucliaSearch
        from nucliadb_models.search import FindRequest, ResourceProperties

        request = FindRequest(
            query=query,
            page_size=k,
            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
        )
        search = NucliaSearch()
        results = search.find(
            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
        )
        paragraphs = []
        for resource in results.resources.values():
            for field in resource.fields.values():
                for paragraph_id, paragraph in field.paragraphs.items():
                    # Paragraph ids are read as "<rid>/<type>/<field id>/...";
                    # skip anything too short to carry a type and a field id.
                    info = paragraph_id.split("/")
                    if len(info) < 3:
                        continue
                    field_type = FIELD_TYPES.get(info[1])
                    if not field_type:
                        continue
                    field_id = info[2]
                    # The attribute may be absent or None; treat both as empty.
                    values = getattr(resource.data, field_type, None) or {}
                    value = values.get(field_id)
                    paragraphs.append(
                        {
                            "text": paragraph.text,
                            "metadata": {
                                "extra": getattr(
                                    getattr(resource, "extra", {}), "metadata", None
                                ),
                                "value": value,
                            },
                            "order": paragraph.order,
                        }
                    )
        sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
        return [
            Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
            for paragraph in sorted_paragraphs
        ]

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from texts and embeddings.

        Not implemented for NucliaDB: always raises NotImplementedError.
        """
        raise NotImplementedError
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,98 @@
|
||||
from typing import Any
|
||||
from unittest import mock
|
||||
|
||||
from langchain.vectorstores.nucliadb import NucliaDB
|
||||
|
||||
|
||||
class attrdict(dict):
    """dict whose keys can also be read as attributes.

    Nested dict values are wrapped in `attrdict` on access, so lookups can be
    chained (``d.a.b``). Used by the tests to fake the objects returned by the
    nuclia SDK.
    """

    def __getitem__(self, key: str) -> Any:
        """Return the value for `key`, wrapping dict values in attrdict."""
        value = dict.__getitem__(self, key)
        return attrdict(value) if isinstance(value, dict) else value

    def __getattr__(self, name: str) -> Any:
        """Resolve a missing attribute as a key lookup.

        Raises AttributeError (not KeyError) for unknown names so that
        ``getattr(obj, name, default)`` and ``hasattr`` work as expected.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name) from None
|
||||
|
||||
|
||||
def FakeCreate(**args: Any) -> Any:
    """Factory for a fake resource-creation method.

    Intended as a `new_callable` for `mock.patch`: the returned function takes
    the place of the patched method and always yields the same id.
    """

    def _fake_create(self: Any, **kwargs: Any) -> str:
        # Every created resource gets the same canned identifier.
        canned_id = "fake_uuid"
        return canned_id

    return _fake_create
|
||||
|
||||
|
||||
def FakeDelete(**args: Any) -> Any:
    """Factory for a fake resource-deletion method.

    Intended as a `new_callable` for `mock.patch`: the returned function takes
    the place of the patched method, accepts any keyword arguments and does
    nothing.
    """

    def _fake_delete(self: Any, **kwargs: Any) -> None:
        # Deleting always "succeeds": nothing is raised, nothing is returned.
        return

    return _fake_delete
|
||||
|
||||
|
||||
def FakeFind(**args: Any) -> Any:
    """Factory for a fake search method.

    Intended as a `new_callable` for `mock.patch`: the returned function takes
    the place of the patched method and yields a canned result holding one
    resource ("123") with one field ("456") and one paragraph.
    """

    def _fake_find(self: Any, **kwargs: Any) -> Any:
        # Built inside-out: paragraph -> field -> resource -> result.
        paragraph = attrdict({"text": "This is a test", "order": 0})
        field = attrdict({"paragraphs": {"123/t/text/0-14": paragraph}})
        resource = attrdict(
            {
                "fields": {"456": field},
                "data": {"texts": {"text": {"body": "This is a test"}}},
                "extra": attrdict({"metadata": {"some": "metadata"}}),
            }
        )
        return attrdict({"resources": {"123": resource}})

    return _fake_find
|
||||
|
||||
|
||||
def test_add_texts() -> None:
    """add_texts returns one id per text when resource creation is faked."""
    new_texts = ["This is a new test", "This is a second test"]
    create_patch = mock.patch(
        "nuclia.sdk.resource.NucliaResource.create",
        new_callable=FakeCreate,
    )
    with create_patch:
        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        assert ndb.is_local is False
        ids = ndb.add_texts(new_texts)
        assert len(ids) == 2
|
||||
|
||||
|
||||
def test_delete() -> None:
    """delete reports success when every faked deletion completes."""
    doomed_ids = ["123", "456"]
    delete_patch = mock.patch(
        "nuclia.sdk.resource.NucliaResource.delete",
        new_callable=FakeDelete,
    )
    with delete_patch:
        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        success = ndb.delete(doomed_ids)
        assert success
|
||||
|
||||
|
||||
def test_search() -> None:
    """similarity_search turns the faked find result into a single Document."""
    find_patch = mock.patch(
        "nuclia.sdk.search.NucliaSearch.find",
        new_callable=FakeFind,
    )
    with find_patch:
        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
        results = ndb.similarity_search("Who was inspired by Ada Lovelace?")
        assert len(results) == 1
        first = results[0]
        assert first.page_content == "This is a test"
        assert first.metadata["extra"]["some"] == "metadata"
        assert first.metadata["value"]["body"] == "This is a test"
|
Loading…
Reference in New Issue