Bagatur/nuclia vector (#10301)

1 year ago · 849e345371
parent b64a443f72 0c760f184c
commit 849e345371
5 changed files with 547 additions and 561 deletions
--- a/docs/extras/integrations/document_transformers/nuclia_transformer.ipynb
+++ b/docs/extras/integrations/document_transformers/nuclia_transformer.ipynb
@ -18,7 +18,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
@ -93,8 +93,22 @@
  }
 ],
 "metadata": {
+  "kernelspec": {
+   "display_name": "langchain",
+   "language": "python",
+   "name": "python3"
+  },
  "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
  },
  "orig_nbformat": 4
 },
--- a/docs/extras/integrations/vectorstores/nucliadb_vectorstore.ipynb
+++ b/docs/extras/integrations/vectorstores/nucliadb_vectorstore.ipynb
@ -0,0 +1,151 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# NucliaDB vector store\n",
+    "\n",
+    "You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n",
+    "\n",
+    "When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [creating a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install nuclia"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage with nuclia.cloud"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "nuclia python package not found. Please install it with `pip install nuclia`.",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
+      "File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:39\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m     38\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 39\u001b[0m     \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m     40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n",
+      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'nuclia'",
+      "\nDuring handling of the above exception, another exception occurred:\n",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[1], line 4\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mlangchain\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mvectorstores\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39mnucliadb\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaDB\n\u001b[1;32m      2\u001b[0m API_KEY \u001b[39m=\u001b[39m \u001b[39m\"\u001b[39m\u001b[39mYOUR_API_KEY\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[0;32m----> 4\u001b[0m ndb \u001b[39m=\u001b[39m NucliaDB(knowledge_box\u001b[39m=\u001b[39;49m\u001b[39m\"\u001b[39;49m\u001b[39mYOUR_KB_ID\u001b[39;49m\u001b[39m\"\u001b[39;49m, local\u001b[39m=\u001b[39;49m\u001b[39mFalse\u001b[39;49;00m, api_key\u001b[39m=\u001b[39;49mAPI_KEY)\n",
+      "File \u001b[0;32m~/dev/osprojects/langchain/libs/langchain/langchain/vectorstores/nucliadb.py:41\u001b[0m, in \u001b[0;36mNucliaDB.__init__\u001b[0;34m(self, knowledge_box, local, api_key, backend)\u001b[0m\n\u001b[1;32m     39\u001b[0m     \u001b[39mfrom\u001b[39;00m \u001b[39mnuclia\u001b[39;00m\u001b[39m.\u001b[39;00m\u001b[39msdk\u001b[39;00m \u001b[39mimport\u001b[39;00m NucliaAuth\n\u001b[1;32m     40\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mImportError\u001b[39;00m:\n\u001b[0;32m---> 41\u001b[0m     \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\n\u001b[1;32m     42\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mnuclia python package not found. \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     43\u001b[0m         \u001b[39m\"\u001b[39m\u001b[39mPlease install it with `pip install nuclia`.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m     44\u001b[0m     )\n\u001b[1;32m     45\u001b[0m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_config[\u001b[39m\"\u001b[39m\u001b[39mLOCAL\u001b[39m\u001b[39m\"\u001b[39m] \u001b[39m=\u001b[39m local\n\u001b[1;32m     46\u001b[0m zone \u001b[39m=\u001b[39m os\u001b[39m.\u001b[39menviron\u001b[39m.\u001b[39mget(\u001b[39m\"\u001b[39m\u001b[39mNUCLIA_ZONE\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39m\"\u001b[39m\u001b[39meurope-1\u001b[39m\u001b[39m\"\u001b[39m)\n",
+      "\u001b[0;31mValueError\u001b[0m: nuclia python package not found. Please install it with `pip install nuclia`."
+     ]
+    }
+   ],
+   "source": [
+    "from langchain.vectorstores.nucliadb import NucliaDB\n",
+    "API_KEY = \"YOUR_API_KEY\"\n",
+    "\n",
+    "ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=False, api_key=API_KEY)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Usage with a local instance\n",
+    "\n",
+    "Note: By default `backend` is set to `http://localhost:8080`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.vectorstores.nucliadb import NucliaDB\n",
+    "\n",
+    "ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=True, backend=\"http://my-local-server\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Add and delete texts in your Knowledge Box"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ids = ndb.add_texts([\"This is a new test\", \"This is a second test\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ndb.delete(ids=ids)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Search in your Knowledge Box"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "results = ndb.similarity_search(\"Who was inspired by Ada Lovelace?\")\n",
+    "print(results[0].page_content)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "langchain",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/libs/langchain/langchain/vectorstores/nucliadb.py
+++ b/libs/langchain/langchain/vectorstores/nucliadb.py
@ -0,0 +1,159 @@
+import os
+from typing import Any, Dict, Iterable, List, Optional, Type
+
+from langchain.embeddings.base import Embeddings
+from langchain.schema.document import Document
+from langchain.vectorstores.base import VST, VectorStore
+
+# Maps the short field-type code found in a paragraph id (its second
+# "/"-separated segment) to the attribute name looked up on a resource's data.
+FIELD_TYPES = {
+    "f": "files",
+    "t": "texts",
+    "l": "links",
+}
+
+
+class NucliaDB(VectorStore):
+    """NucliaDB vector store."""
+
+    _config: Dict[str, Any] = {}
+
+    def __init__(
+        self,
+        knowledge_box: str,
+        local: bool,
+        api_key: Optional[str] = None,
+        backend: Optional[str] = None,
+    ) -> None:
+        """Initialize the NucliaDB client.
+
+        Args:
+            knowledge_box: the Knowledge Box id.
+            local: Whether to use a local NucliaDB instance or Nuclia Cloud
+            api_key: A contributor API key for the kb (needed when local is False)
+            backend: The backend url to use when local is True, defaults to
+            http://localhost:8080
+        """
+        try:
+            from nuclia.sdk import NucliaAuth
+        except ImportError:
+            raise ValueError(
+                "nuclia python package not found. "
+                "Please install it with `pip install nuclia`."
+            )
+        self._config["LOCAL"] = local
+        zone = os.environ.get("NUCLIA_ZONE", "europe-1")
+        self._kb = knowledge_box
+        if local:
+            if not backend:
+                backend = "http://localhost:8080"
+            self._config["BACKEND"] = f"{backend}/api/v1"
+            self._config["TOKEN"] = None
+            NucliaAuth().nucliadb(url=backend)
+            NucliaAuth().kb(url=self.kb_url, interactive=False)
+        else:
+            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
+            self._config["TOKEN"] = api_key
+            NucliaAuth().kb(
+                url=self.kb_url, token=self._config["TOKEN"], interactive=False
+            )
+
+    @property
+    def is_local(self) -> str:
+        return self._config["LOCAL"]
+
+    @property
+    def kb_url(self) -> str:
+        return f"{self._config['BACKEND']}/kb/{self._kb}"
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Upload texts to NucliaDB"""
+        ids = []
+        from nuclia.sdk import NucliaResource
+
+        factory = NucliaResource()
+        for i, text in enumerate(texts):
+            extra: Dict[str, Any] = {"metadata": ""}
+            if metadatas:
+                extra = {"metadata": metadatas[i]}
+            id = factory.create(
+                texts={"text": {"body": text}},
+                extra=extra,
+                url=self.kb_url,
+                api_key=self._config["TOKEN"],
+            )
+            ids.append(id)
+        return ids
+
+    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
+        if not ids:
+            return None
+        from nuclia.sdk import NucliaResource
+
+        factory = NucliaResource()
+        results: List[bool] = []
+        for id in ids:
+            try:
+                factory.delete(rid=id, url=self.kb_url, api_key=self._config["TOKEN"])
+                results.append(True)
+            except ValueError:
+                results.append(False)
+        return all(results)
+
+    def similarity_search(
+        self, query: str, k: int = 4, **kwargs: Any
+    ) -> List[Document]:
+        from nuclia.sdk import NucliaSearch
+        from nucliadb_models.search import FindRequest, ResourceProperties
+
+        request = FindRequest(
+            query=query,
+            page_size=k,
+            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
+        )
+        search = NucliaSearch()
+        results = search.find(
+            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
+        )
+        paragraphs = []
+        for resource in results.resources.values():
+            for field in resource.fields.values():
+                for paragraph_id, paragraph in field.paragraphs.items():
+                    info = paragraph_id.split("/")
+                    field_type = FIELD_TYPES.get(info[1], None)
+                    field_id = info[2]
+                    if not field_type:
+                        continue
+                    value = getattr(resource.data, field_type, {}).get(field_id, None)
+                    paragraphs.append(
+                        {
+                            "text": paragraph.text,
+                            "metadata": {
+                                "extra": getattr(
+                                    getattr(resource, "extra", {}), "metadata", None
+                                ),
+                                "value": value,
+                            },
+                            "order": paragraph.order,
+                        }
+                    )
+        sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
+        return [
+            Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
+            for paragraph in sorted_paragraphs
+        ]
+
+    @classmethod
+    def from_texts(
+        cls: Type[VST],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> VST:
+        """Return VectorStore initialized from texts and embeddings."""
+        raise NotImplementedError
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
--- a/libs/langchain/tests/integration_tests/vectorstores/test_nucliadb.py
+++ b/libs/langchain/tests/integration_tests/vectorstores/test_nucliadb.py
@ -0,0 +1,98 @@
+from typing import Any
+from unittest import mock
+
+from langchain.vectorstores.nucliadb import NucliaDB
+
+
+class attrdict(dict):
+    def __getitem__(self, key: str) -> Any:
+        value = dict.__getitem__(self, key)
+        return attrdict(value) if isinstance(value, dict) else value
+
+    __getattr__ = __getitem__
+
+
+def FakeCreate(**args: Any) -> Any:
+    def fn(self: Any, **kwargs: Any) -> str:
+        return "fake_uuid"
+
+    return fn
+
+
+def FakeDelete(**args: Any) -> Any:
+    def fn(self: Any, **kwargs: Any) -> None:
+        return None
+
+    return fn
+
+
+def FakeFind(**args: Any) -> Any:
+    def fn(self: Any, **kwargs: Any) -> Any:
+        return attrdict(
+            {
+                "resources": {
+                    "123": attrdict(
+                        {
+                            "fields": {
+                                "456": attrdict(
+                                    {
+                                        "paragraphs": {
+                                            "123/t/text/0-14": attrdict(
+                                                {
+                                                    "text": "This is a test",
+                                                    "order": 0,
+                                                }
+                                            ),
+                                        }
+                                    }
+                                )
+                            },
+                            "data": {
+                                "texts": {
+                                    "text": {
+                                        "body": "This is a test",
+                                    }
+                                }
+                            },
+                            "extra": attrdict({"metadata": {"some": "metadata"}}),
+                        }
+                    )
+                }
+            }
+        )
+
+    return fn
+
+
+def test_add_texts() -> None:
+    with mock.patch(
+        "nuclia.sdk.resource.NucliaResource.create",
+        new_callable=FakeCreate,
+    ):
+        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
+        assert ndb.is_local is False
+        ids = ndb.add_texts(["This is a new test", "This is a second test"])
+        assert len(ids) == 2
+
+
+def test_delete() -> None:
+    with mock.patch(
+        "nuclia.sdk.resource.NucliaResource.delete",
+        new_callable=FakeDelete,
+    ):
+        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
+        success = ndb.delete(["123", "456"])
+        assert success
+
+
+def test_search() -> None:
+    with mock.patch(
+        "nuclia.sdk.search.NucliaSearch.find",
+        new_callable=FakeFind,
+    ):
+        ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
+        results = ndb.similarity_search("Who was inspired by Ada Lovelace?")
+        assert len(results) == 1
+        assert results[0].page_content == "This is a test"
+        assert results[0].metadata["extra"]["some"] == "metadata"
+        assert results[0].metadata["value"]["body"] == "This is a test"