import os
from typing import Any, Dict, Iterable, List, Optional, Type

from langchain_core.documents import Document
from langchain_core.embeddings import Embeddings
from langchain_core.vectorstores import VST, VectorStore

FIELD_TYPES = {
    "f": "files",
    "t": "texts",
    "l": "links",
}
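# FIELD_TYPES maps the single-letter field-type code that appears as the
# second segment of a NucliaDB paragraph id ("<rid>/<code>/<field id>/...")
# to the matching attribute name on ``resource.data``. The id layout here is
# inferred from how ``similarity_search`` parses it below.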


class NucliaDB(VectorStore):
    """NucliaDB vector store."""

    _config: Dict[str, Any] = {}

    def __init__(
        self,
        knowledge_box: str,
        local: bool,
        api_key: Optional[str] = None,
        backend: Optional[str] = None,
    ) -> None:
        """Initialize the NucliaDB client.

        Args:
            knowledge_box: the Knowledge Box id.
            local: whether to use a local NucliaDB instance or Nuclia Cloud.
            api_key: a contributor API key for the Knowledge Box (required
                when ``local`` is False).
            backend: the backend URL to use when ``local`` is True; defaults
                to http://localhost:8080.
        """
        try:
            from nuclia.sdk import NucliaAuth
        except ImportError:
            raise ImportError(
                "nuclia python package not found. "
                "Please install it with `pip install nuclia`."
            )
        self._config["LOCAL"] = local
        zone = os.environ.get("NUCLIA_ZONE", "europe-1")
        self._kb = knowledge_box
        if local:
            # Local instances are reached directly by URL; no API key is used.
            if not backend:
                backend = "http://localhost:8080"
            self._config["BACKEND"] = f"{backend}/api/v1"
            self._config["TOKEN"] = None
            NucliaAuth().nucliadb(url=backend)
            NucliaAuth().kb(url=self.kb_url, interactive=False)
        else:
            # Nuclia Cloud endpoints are zoned; the zone comes from the
            # NUCLIA_ZONE environment variable (default "europe-1").
            self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
            self._config["TOKEN"] = api_key
            NucliaAuth().kb(
                url=self.kb_url, token=self._config["TOKEN"], interactive=False
            )
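
    # A minimal construction sketch ("my-kb" and the token below are
    # placeholder values, not real identifiers):
    #
    #   store = NucliaDB(knowledge_box="my-kb", local=True)
    #   store = NucliaDB(knowledge_box="my-kb", local=False, api_key="<token>")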

    @property
    def is_local(self) -> bool:
        return self._config["LOCAL"]

    @property
    def kb_url(self) -> str:
        return f"{self._config['BACKEND']}/kb/{self._kb}"

    def add_texts(
        self,
        texts: Iterable[str],
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> List[str]:
        """Upload texts to NucliaDB and return the created resource ids."""
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        ids = []
        for i, text in enumerate(texts):
            # Arbitrary metadata rides along in the resource's "extra" slot.
            extra: Dict[str, Any] = {"metadata": ""}
            if metadatas:
                extra = {"metadata": metadatas[i]}
            rid = factory.create(
                texts={"text": {"body": text}},
                extra=extra,
                url=self.kb_url,
                api_key=self._config["TOKEN"],
            )
            ids.append(rid)
        return ids
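
    # Usage sketch (placeholder values): the returned ids are NucliaDB
    # resource ids and can be passed back to ``delete`` below:
    #
    #   ids = store.add_texts(["hello world"], metadatas=[{"source": "demo"}])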

    def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
        """Delete resources by id.

        Returns None when no ids are given, True when every deletion
        succeeded, and False otherwise.
        """
        if not ids:
            return None
        from nuclia.sdk import NucliaResource

        factory = NucliaResource()
        results: List[bool] = []
        for rid in ids:
            try:
                factory.delete(rid=rid, url=self.kb_url, api_key=self._config["TOKEN"])
                results.append(True)
            except ValueError:
                results.append(False)
        return all(results)

    def similarity_search(
        self, query: str, k: int = 4, **kwargs: Any
    ) -> List[Document]:
        from nuclia.sdk import NucliaSearch
        from nucliadb_models.search import FindRequest, ResourceProperties

        request = FindRequest(
            query=query,
            page_size=k,
            show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
        )
        search = NucliaSearch()
        results = search.find(
            query=request, url=self.kb_url, api_key=self._config["TOKEN"]
        )
        paragraphs = []
        for resource in results.resources.values():
            for field in resource.fields.values():
                for paragraph_id, paragraph in field.paragraphs.items():
                    # Paragraph ids look like "<rid>/<field type>/<field id>/...";
                    # the second segment selects a FIELD_TYPES key.
                    info = paragraph_id.split("/")
                    field_type = FIELD_TYPES.get(info[1], None)
                    field_id = info[2]
                    if not field_type:
                        continue
                    value = getattr(resource.data, field_type, {}).get(field_id, None)
                    paragraphs.append(
                        {
                            "text": paragraph.text,
                            "metadata": {
                                "extra": getattr(
                                    getattr(resource, "extra", {}), "metadata", None
                                ),
                                "value": value,
                            },
                            "order": paragraph.order,
                        }
                    )
        # Matches come back grouped by resource and field; re-sort them by
        # their original order within the source documents.
        sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
        return [
            Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
            for paragraph in sorted_paragraphs
        ]
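
    # Shape of each returned Document (a sketch; actual values depend on the
    # stored resources):
    #
    #   Document(
    #       page_content="<matching paragraph text>",
    #       metadata={"extra": <user metadata>, "value": <raw field value>},
    #   )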

    @classmethod
    def from_texts(
        cls: Type[VST],
        texts: List[str],
        embedding: Embeddings,
        metadatas: Optional[List[dict]] = None,
        **kwargs: Any,
    ) -> VST:
        """Return VectorStore initialized from texts and embeddings.

        Not supported here: NucliaDB vectorizes content server-side, so the
        client-side ``embedding`` argument is never used by this store.
        """
        raise NotImplementedError
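

# ---------------------------------------------------------------------------
# A minimal usage sketch, not part of the library. It assumes a local
# NucliaDB instance listening on http://localhost:8080 and a Knowledge Box
# named "my-kb"; both are placeholder assumptions, not guaranteed defaults.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    store = NucliaDB(knowledge_box="my-kb", local=True)
    # Upload two texts; NucliaDB vectorizes them server-side.
    ids = store.add_texts(
        ["LangChain integrates with NucliaDB.", "Vector stores hold embeddings."],
        metadatas=[{"source": "demo"}, {"source": "demo"}],
    )
    # Retrieve the paragraphs most similar to the query.
    for doc in store.similarity_search("What does LangChain integrate with?", k=2):
        print(doc.page_content, doc.metadata)
    # Clean up the demo resources.
    store.delete(ids=ids)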