langchain/libs/community/langchain_community/vectorstores/nucliadb.py

import os
from typing import Any, Dict, Iterable, List, Optional, Type

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VST, VectorStore
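
# Maps the one-letter field type code embedded in a NucliaDB paragraph id
# (e.g. "<resource id>/t/<field id>/<start>-<end>", as parsed in
# `similarity_search` below) to the matching attribute name on `resource.data`.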
FIELD_TYPES = {
    "f": "files",
    "t": "texts",
    "l": "links",
}


class NucliaDB(VectorStore):
    """NucliaDB vector store."""

    _config: Dict[str, Any] = {}

    def __init__(
        self,
        knowledge_box: str,
        local: bool,
        api_key: Optional[str] = None,
        backend: Optional[str] = None,
    ) -> None:
        """Initialize the NucliaDB client.

        Args:
            knowledge_box: the Knowledge Box id.
            local: whether to use a local NucliaDB instance or Nuclia Cloud.
            api_key: a contributor API key for the kb (needed when local is False).
            backend: the backend URL to use when local is True; defaults to
                http://localhost:8080.
        """
        try:
            from nuclia.sdk import NucliaAuth
        except ImportError:
            raise ValueError(
                "nuclia python package not found. "
                "Please install it with `pip install nuclia`."
            )
        self._config["LOCAL"] = local
        zone = os.environ.get("NUCLIA_ZONE", "europe-1")
        self._kb = knowledge_box
        if local:
            if not backend:
                backend = "http://localhost:8080"
            self._config["BACKEND"] = f"{backend}/api/v1"
            self._config["TOKEN"] = None
            NucliaAuth().nucliadb(url=backend)
            NucliaAuth().kb(url=self.kb_url, interactive=False)
        else:
            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
            self._config["TOKEN"] = api_key
            NucliaAuth().kb(
                url=self.kb_url, token=self._config["TOKEN"], interactive=False
            )
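
    # A minimal construction sketch, assuming a local NucliaDB server on
    # http://localhost:8080 and a hypothetical knowledge box id "my-kb";
    # for Nuclia Cloud, pass local=False plus a contributor API key:
    #
    #     ndb = NucliaDB(knowledge_box="my-kb", local=True)
    #     cloud = NucliaDB(knowledge_box="my-kb", local=False, api_key="<API KEY>")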

    @property
    def is_local(self) -> bool:
        return self._config["LOCAL"]

    @property
    def kb_url(self) -> str:
        return f"{self._config['BACKEND']}/kb/{self._kb}"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to NucliaDB."""
        ids = []
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        for i, text in enumerate(texts):
            extra: Dict[str, Any] = {"metadata": ""}
            if metadatas:
                extra = {"metadata": metadatas[i]}
            id = factory.create(
                texts={"text": {"body": text}},
                extra=extra,
                url=self.kb_url,
                api_key=self._config["TOKEN"],
            )
            ids.append(id)
        return ids
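
    # Usage sketch, assuming `ndb` was constructed as in the example above;
    # Nuclia vectorizes texts server-side, so only the raw texts are sent and
    # the returned list holds the resource ids NucliaDB assigns:
    #
    #     ids = ndb.add_texts(
    #         ["NucliaDB is a distributed search engine", "It indexes text"],
    #         metadatas=[{"source": "demo"}, {"source": "demo"}],
    #     )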

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        if not ids:
            return None
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        results: List[bool] = []
        for id in ids:
            try:
                factory.delete(rid=id, url=self.kb_url, api_key=self._config["TOKEN"])
                results.append(True)
            except ValueError:
                results.append(False)
        return all(results)
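
    # Usage sketch: returns True only if every id was deleted, False if any
    # deletion failed, and None when no ids are given:
    #
    #     ndb.delete(ids=ids)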

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        from nuclia.sdk import NucliaSearch
        from nucliadb_models.search import FindRequest, ResourceProperties

        request = FindRequest(
            query=query,
            page_size=k,
            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
        )
        search = NucliaSearch()
        results = search.find(
            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
        )
        paragraphs = []
        for resource in results.resources.values():
            for field in resource.fields.values():
                for paragraph_id, paragraph in field.paragraphs.items():
                    info = paragraph_id.split("/")
                    field_type = FIELD_TYPES.get(info[1], None)
                    field_id = info[2]
                    if not field_type:
                        continue
                    value = getattr(resource.data, field_type, {}).get(field_id, None)
                    paragraphs.append(
                        {
                            "text": paragraph.text,
                            "metadata": {
                                "extra": getattr(
                                    getattr(resource, "extra", {}), "metadata", None
                                ),
                                "value": value,
                            },
                            "order": paragraph.order,
                        }
                    )
        sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
        return [
            Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
            for paragraph in sorted_paragraphs
        ]
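
    # Usage sketch: returns matching paragraphs as LangChain Documents, sorted
    # by the paragraph order NucliaDB reports; each Document's metadata carries
    # the resource's `extra` metadata and the raw field value:
    #
    #     docs = ndb.similarity_search("what does NucliaDB index?", k=4)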

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from texts and embeddings."""
        raise NotImplementedError
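

if __name__ == "__main__":
    # End-to-end sketch under assumed settings: a local NucliaDB instance on
    # http://localhost:8080 and a hypothetical knowledge box id "my-kb".
    # Vectorization happens server-side in Nuclia, so no Embeddings object is
    # passed (which is also why `from_texts` is not implemented).
    ndb = NucliaDB(knowledge_box="my-kb", local=True)
    ids = ndb.add_texts(
        ["NucliaDB stores and indexes text resources"],
        metadatas=[{"source": "demo"}],
    )
    for doc in ndb.similarity_search("how are texts stored?", k=2):
        print(doc.page_content, doc.metadata)
    ndb.delete(ids=ids)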