mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
b11fd3bedc
- **Description:** some document loaders add integer-type metadata values which cause error - **Issue:** 15937 - **Dependencies:** none --------- Co-authored-by: JY <jyjy@jaguardb>
456 lines
14 KiB
Python
456 lines
14 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
from typing import Any, List, Optional, Tuple
|
|
|
|
from langchain_core.documents import Document
|
|
from langchain_core.embeddings import Embeddings
|
|
from langchain_core.vectorstores import VectorStore
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Jaguar(VectorStore):
|
|
"""`Jaguar API` vector store.
|
|
|
|
See http://www.jaguardb.com
|
|
See http://github.com/fserv/jaguar-sdk
|
|
|
|
Example:
|
|
.. code-block:: python
|
|
|
|
from langchain_community.vectorstores.jaguar import Jaguar
|
|
|
|
vectorstore = Jaguar(
|
|
pod = 'vdb',
|
|
store = 'mystore',
|
|
vector_index = 'v',
|
|
vector_type = 'cosine_fraction_float',
|
|
vector_dimension = 1536,
|
|
url='http://192.168.8.88:8080/fwww/',
|
|
embedding=openai_model
|
|
)
|
|
"""
|
|
|
|
def __init__(
|
|
self,
|
|
pod: str,
|
|
store: str,
|
|
vector_index: str,
|
|
vector_type: str,
|
|
vector_dimension: int,
|
|
url: str,
|
|
embedding: Embeddings,
|
|
):
|
|
self._pod = pod
|
|
self._store = store
|
|
self._vector_index = vector_index
|
|
self._vector_type = vector_type
|
|
self._vector_dimension = vector_dimension
|
|
|
|
self._embedding = embedding
|
|
try:
|
|
from jaguardb_http_client.JaguarHttpClient import JaguarHttpClient
|
|
except ImportError:
|
|
raise ValueError(
|
|
"Could not import jaguardb-http-client python package. "
|
|
"Please install it with `pip install -U jaguardb-http-client`"
|
|
)
|
|
|
|
self._jag = JaguarHttpClient(url)
|
|
self._token = ""
|
|
|
|
def login(
|
|
self,
|
|
jaguar_api_key: Optional[str] = "",
|
|
) -> bool:
|
|
"""
|
|
login to jaguardb server with a jaguar_api_key or let self._jag find a key
|
|
Args:
|
|
pod (str): name of a Pod
|
|
store (str): name of a vector store
|
|
optional jaguar_api_key (str): API key of user to jaguardb server
|
|
Returns:
|
|
True if successful; False if not successful
|
|
"""
|
|
|
|
if jaguar_api_key == "":
|
|
jaguar_api_key = self._jag.getApiKey()
|
|
self._jaguar_api_key = jaguar_api_key
|
|
self._token = self._jag.login(jaguar_api_key)
|
|
if self._token == "":
|
|
logger.error("E0001 error init(): invalid jaguar_api_key")
|
|
return False
|
|
return True
|
|
|
|
def create(
|
|
self,
|
|
metadata_str: str,
|
|
text_size: int,
|
|
) -> None:
|
|
"""
|
|
create the vector store on the backend database
|
|
Args:
|
|
metadata_str (str): columns and their types
|
|
Returns:
|
|
True if successful; False if not successful
|
|
"""
|
|
podstore = self._pod + "." + self._store
|
|
|
|
"""
|
|
source column is required.
|
|
v:text column is required.
|
|
"""
|
|
q = "create store "
|
|
q += podstore
|
|
q += f" ({self._vector_index} vector({self._vector_dimension},"
|
|
q += f" '{self._vector_type}'),"
|
|
q += f" source char(256), v:text char({text_size}),"
|
|
q += metadata_str + ")"
|
|
self.run(q)
|
|
|
|
def run(self, query: str, withFile: bool = False) -> dict:
|
|
"""
|
|
Run any query statement in jaguardb
|
|
Args:
|
|
query (str): query statement to jaguardb
|
|
Returns:
|
|
None for invalid token, or
|
|
json result string
|
|
"""
|
|
if self._token == "":
|
|
logger.error(f"E0005 error run({query})")
|
|
return {}
|
|
|
|
resp = self._jag.post(query, self._token, withFile)
|
|
txt = resp.text
|
|
try:
|
|
js = json.loads(txt)
|
|
return js
|
|
except Exception:
|
|
return {}
|
|
|
|
@property
|
|
def embeddings(self) -> Optional[Embeddings]:
|
|
return self._embedding
|
|
|
|
def add_texts(
|
|
self,
|
|
texts: List[str],
|
|
metadatas: Optional[List[dict]] = None,
|
|
**kwargs: Any,
|
|
) -> List[str]:
|
|
"""
|
|
Add texts through the embeddings and add to the vectorstore.
|
|
Args:
|
|
texts: list of text strings to add to the jaguar vector store.
|
|
metadatas: Optional list of metadatas associated with the texts.
|
|
[{"m1": "v11", "m2": "v12", "m3": "v13", "filecol": "path_file1.jpg" },
|
|
{"m1": "v21", "m2": "v22", "m3": "v23", "filecol": "path_file2.jpg" },
|
|
{"m1": "v31", "m2": "v32", "m3": "v33", "filecol": "path_file3.jpg" },
|
|
{"m1": "v41", "m2": "v42", "m3": "v43", "filecol": "path_file4.jpg" }]
|
|
kwargs: vector_index=name_of_vector_index
|
|
file_column=name_of_file_column
|
|
|
|
Returns:
|
|
List of ids from adding the texts into the vectorstore
|
|
"""
|
|
vcol = self._vector_index
|
|
filecol = kwargs.get("file_column", "")
|
|
text_tag = kwargs.get("text_tag", "")
|
|
podstorevcol = self._pod + "." + self._store + "." + vcol
|
|
q = "textcol " + podstorevcol
|
|
js = self.run(q)
|
|
if js == "":
|
|
return []
|
|
textcol = js["data"]
|
|
|
|
if text_tag != "":
|
|
tag_texts = []
|
|
for t in texts:
|
|
tag_texts.append(text_tag + " " + t)
|
|
texts = tag_texts
|
|
|
|
embeddings = self._embedding.embed_documents(list(texts))
|
|
ids = []
|
|
if metadatas is None:
|
|
### no meta and no files to upload
|
|
i = 0
|
|
for vec in embeddings:
|
|
str_vec = [str(x) for x in vec]
|
|
values_comma = ",".join(str_vec)
|
|
podstore = self._pod + "." + self._store
|
|
q = "insert into " + podstore + " ("
|
|
q += vcol + "," + textcol + ") values ('" + values_comma
|
|
txt = texts[i].replace("'", "\\'")
|
|
q += "','" + txt + "')"
|
|
js = self.run(q, False)
|
|
ids.append(js["zid"])
|
|
i += 1
|
|
else:
|
|
i = 0
|
|
for vec in embeddings:
|
|
str_vec = [str(x) for x in vec]
|
|
nvec, vvec, filepath = self._parseMeta(metadatas[i], filecol)
|
|
if filecol != "":
|
|
rc = self._jag.postFile(self._token, filepath, 1)
|
|
if not rc:
|
|
return []
|
|
names_comma = ",".join(nvec)
|
|
names_comma += "," + vcol
|
|
## col1,col2,col3,vecl
|
|
values_comma = "'" + "','".join(vvec) + "'"
|
|
### 'va1','val2','val3'
|
|
values_comma += ",'" + ",".join(str_vec) + "'"
|
|
### 'v1,v2,v3'
|
|
podstore = self._pod + "." + self._store
|
|
q = "insert into " + podstore + " ("
|
|
q += names_comma + "," + textcol + ") values (" + values_comma
|
|
txt = texts[i].replace("'", "\\'")
|
|
q += ",'" + txt + "')"
|
|
if filecol != "":
|
|
js = self.run(q, True)
|
|
else:
|
|
js = self.run(q, False)
|
|
ids.append(js["zid"])
|
|
i += 1
|
|
|
|
return ids
|
|
|
|
def similarity_search_with_score(
|
|
self,
|
|
query: str,
|
|
k: int = 3,
|
|
fetch_k: int = -1,
|
|
where: Optional[str] = None,
|
|
args: Optional[str] = None,
|
|
metadatas: Optional[List[str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Tuple[Document, float]]:
|
|
"""
|
|
Return Jaguar documents most similar to query, along with scores.
|
|
Args:
|
|
query: Text to look up documents similar to.
|
|
k: Number of Documents to return. Defaults to 3.
|
|
lambda_val: lexical match parameter for hybrid search.
|
|
where: the where clause in select similarity. For example a
|
|
where can be "rating > 3.0 and (state = 'NV' or state = 'CA')"
|
|
args: extra options passed to select similarity
|
|
kwargs: vector_index=vcol, vector_type=cosine_fraction_float
|
|
Returns:
|
|
List of Documents most similar to the query and score for each.
|
|
List of Tuples of (doc, similarity_score):
|
|
[ (doc, score), (doc, score), ...]
|
|
"""
|
|
vcol = self._vector_index
|
|
vtype = self._vector_type
|
|
embeddings = self._embedding.embed_query(query)
|
|
str_embeddings = [str(f) for f in embeddings]
|
|
qv_comma = ",".join(str_embeddings)
|
|
podstore = self._pod + "." + self._store
|
|
q = (
|
|
"select similarity("
|
|
+ vcol
|
|
+ ",'"
|
|
+ qv_comma
|
|
+ "','topk="
|
|
+ str(k)
|
|
+ ",fetch_k="
|
|
+ str(fetch_k)
|
|
+ ",type="
|
|
+ vtype
|
|
)
|
|
q += ",with_score=yes,with_text=yes"
|
|
if args is not None:
|
|
q += "," + args
|
|
|
|
if metadatas is not None:
|
|
meta = "&".join(metadatas)
|
|
q += ",metadata=" + meta
|
|
|
|
q += "') from " + podstore
|
|
|
|
if where is not None:
|
|
q += " where " + where
|
|
|
|
jarr = self.run(q)
|
|
if jarr is None:
|
|
return []
|
|
|
|
docs_with_score = []
|
|
for js in jarr:
|
|
score = js["score"]
|
|
text = js["text"]
|
|
zid = js["zid"]
|
|
|
|
### give metadatas
|
|
md = {}
|
|
md["zid"] = zid
|
|
if metadatas is not None:
|
|
for m in metadatas:
|
|
mv = js[m]
|
|
md[m] = mv
|
|
|
|
doc = Document(page_content=text, metadata=md)
|
|
tup = (doc, score)
|
|
docs_with_score.append(tup)
|
|
|
|
return docs_with_score
|
|
|
|
def similarity_search(
|
|
self,
|
|
query: str,
|
|
k: int = 3,
|
|
where: Optional[str] = None,
|
|
metadatas: Optional[List[str]] = None,
|
|
**kwargs: Any,
|
|
) -> List[Document]:
|
|
"""
|
|
Return Jaguar documents most similar to query, along with scores.
|
|
Args:
|
|
query: Text to look up documents similar to.
|
|
k: Number of Documents to return. Defaults to 5.
|
|
where: the where clause in select similarity. For example a
|
|
where can be "rating > 3.0 and (state = 'NV' or state = 'CA')"
|
|
Returns:
|
|
List of Documents most similar to the query
|
|
"""
|
|
docs_and_scores = self.similarity_search_with_score(
|
|
query, k=k, where=where, metadatas=metadatas, **kwargs
|
|
)
|
|
return [doc for doc, _ in docs_and_scores]
|
|
|
|
def is_anomalous(
|
|
self,
|
|
query: str,
|
|
**kwargs: Any,
|
|
) -> bool:
|
|
"""
|
|
Detect if given text is anomalous from the dataset
|
|
Args:
|
|
query: Text to detect if it is anomaly
|
|
Returns:
|
|
True or False
|
|
"""
|
|
vcol = self._vector_index
|
|
vtype = self._vector_type
|
|
embeddings = self._embedding.embed_query(query)
|
|
str_embeddings = [str(f) for f in embeddings]
|
|
qv_comma = ",".join(str_embeddings)
|
|
podstore = self._pod + "." + self._store
|
|
q = "select anomalous(" + vcol + ", '" + qv_comma + "', 'type=" + vtype + "')"
|
|
q += " from " + podstore
|
|
|
|
js = self.run(q)
|
|
if isinstance(js, list) and len(js) == 0:
|
|
return False
|
|
jd = json.loads(js[0])
|
|
if jd["anomalous"] == "YES":
|
|
return True
|
|
return False
|
|
|
|
@classmethod
|
|
def from_texts(
|
|
cls,
|
|
texts: List[str],
|
|
embedding: Embeddings,
|
|
url: str,
|
|
pod: str,
|
|
store: str,
|
|
vector_index: str,
|
|
vector_type: str,
|
|
vector_dimension: int,
|
|
metadatas: Optional[List[dict]] = None,
|
|
jaguar_api_key: Optional[str] = "",
|
|
**kwargs: Any,
|
|
) -> Jaguar:
|
|
jagstore = cls(
|
|
pod, store, vector_index, vector_type, vector_dimension, url, embedding
|
|
)
|
|
jagstore.login(jaguar_api_key)
|
|
jagstore.clear()
|
|
jagstore.add_texts(texts, metadatas, **kwargs)
|
|
return jagstore
|
|
|
|
def clear(self) -> None:
|
|
"""
|
|
Delete all records in jaguardb
|
|
Args: No args
|
|
Returns: None
|
|
"""
|
|
podstore = self._pod + "." + self._store
|
|
q = "truncate store " + podstore
|
|
self.run(q)
|
|
|
|
def delete(self, zids: List[str], **kwargs: Any) -> None:
|
|
"""
|
|
Delete records in jaguardb by a list of zero-ids
|
|
Args:
|
|
pod (str): name of a Pod
|
|
ids (List[str]): a list of zid as string
|
|
Returns:
|
|
Do not return anything
|
|
"""
|
|
podstore = self._pod + "." + self._store
|
|
for zid in zids:
|
|
q = "delete from " + podstore + " where zid='" + zid + "'"
|
|
self.run(q)
|
|
|
|
def count(self) -> int:
|
|
"""
|
|
Count records of a store in jaguardb
|
|
Args: no args
|
|
Returns: (int) number of records in pod store
|
|
"""
|
|
podstore = self._pod + "." + self._store
|
|
q = "select count() from " + podstore
|
|
js = self.run(q)
|
|
if isinstance(js, list) and len(js) == 0:
|
|
return 0
|
|
jd = json.loads(js[0])
|
|
return int(jd["data"])
|
|
|
|
def drop(self) -> None:
|
|
"""
|
|
Drop or remove a store in jaguardb
|
|
Args: no args
|
|
Returns: None
|
|
"""
|
|
podstore = self._pod + "." + self._store
|
|
q = "drop store " + podstore
|
|
self.run(q)
|
|
|
|
def logout(self) -> None:
|
|
"""
|
|
Logout to cleanup resources
|
|
Args: no args
|
|
Returns: None
|
|
"""
|
|
self._jag.logout(self._token)
|
|
|
|
def prt(self, msg: str) -> None:
|
|
with open("/tmp/debugjaguar.log", "a") as file:
|
|
print(f"msg={msg}", file=file, flush=True)
|
|
|
|
def _parseMeta(self, nvmap: dict, filecol: str) -> Tuple[List[str], List[str], str]:
|
|
filepath = ""
|
|
if filecol == "":
|
|
nvec = list(nvmap.keys())
|
|
vvec = list(nvmap.values())
|
|
else:
|
|
nvec = []
|
|
vvec = []
|
|
if filecol in nvmap:
|
|
nvec.append(filecol)
|
|
vvec.append(nvmap[filecol])
|
|
filepath = nvmap[filecol]
|
|
|
|
for k, v in nvmap.items():
|
|
if k != filecol:
|
|
nvec.append(k)
|
|
vvec.append(v)
|
|
|
|
vvec_s = [str(e) for e in vvec]
|
|
return nvec, vvec_s, filepath
|