@ -166,7 +166,7 @@ class NeuralDBVectorStore(VectorStore):
offset = self . db . _savable_state . documents . get_source_by_id ( source_id ) [ 1 ]
offset = self . db . _savable_state . documents . get_source_by_id ( source_id ) [ 1 ]
return [ str ( offset + i ) for i in range ( len ( texts ) ) ] # type: ignore[arg-type]
return [ str ( offset + i ) for i in range ( len ( texts ) ) ] # type: ignore[arg-type]
@root_validator ( )
@root_validator ( allow_reuse = True )
def validate_environments ( cls , values : Dict ) - > Dict :
def validate_environments ( cls , values : Dict ) - > Dict :
""" Validate ThirdAI environment variables. """
""" Validate ThirdAI environment variables. """
values [ " thirdai_key " ] = convert_to_secret_str (
values [ " thirdai_key " ] = convert_to_secret_str (
@ -314,3 +314,161 @@ class NeuralDBVectorStore(VectorStore):
path : path on disk to save the NeuralDB instance to .
path : path on disk to save the NeuralDB instance to .
"""
"""
self . db . save ( path )
self . db . save ( path )
class NeuralDBClientVectorStore ( VectorStore ) :
""" Vectorstore that uses ThirdAI ' s NeuralDB Enterprise Python Client for NeuralDBs.
To use , you should have the ` ` thirdai [ neural_db ] ` ` python package installed .
Example :
. . code - block : : python
from langchain_community . vectorstores import NeuralDBClientVectorStore
from thirdai . neural_db import ModelBazaar , NeuralDBClient
bazaar = ModelBazaar ( base_url = " http:// {NEURAL_DB_ENTERPRISE_IP} /api/ " )
bazaar . log_in ( email = " user@thirdai.com " , password = " 1234 " )
ndb_client = NeuralDBClient (
deployment_identifier = " user/model-0:user/deployment-0 " ,
base_url = " http:// {NEURAL_DB_ENTERPRISE_IP} /api/ " ,
bazaar = bazaar
)
vectorstore = NeuralDBClientVectorStore ( db = ndb_client )
retriever = vectorstore . as_retriever ( search_kwargs = { ' k ' : 5 } )
"""
def __init__ ( self , db : Any ) - > None :
self . db = db
db : Any = None #: :meta private:
""" NeuralDB Client instance """
class Config :
""" Configuration for this pydantic object. """
extra = Extra . forbid
underscore_attrs_are_private = True
def similarity_search (
self , query : str , k : int = 10 , * * kwargs : Any
) - > List [ Document ] :
""" Retrieve {k} contexts with for a given query
Args :
query : Query to submit to the model
k : The max number of context results to retrieve . Defaults to 10.
"""
try :
references = self . db . search ( query = query , top_k = k , * * kwargs ) [ " references " ]
return [
Document (
page_content = ref [ " text " ] ,
metadata = {
" id " : ref [ " id " ] ,
" source " : ref [ " source " ] ,
" metadata " : ref [ " metadata " ] ,
" score " : ref [ " source " ] ,
" context " : ref [ " context " ] ,
} ,
)
for ref in references
]
except Exception as e :
raise ValueError ( f " Error while retrieving documents: { e } " ) from e
def insert ( self , documents : List [ Dict [ str , Any ] ] ) : # type: ignore[no-untyped-def, no-untyped-def]
"""
Inserts documents into the VectorStore and return the corresponding Sources .
Args :
documents ( List [ Dict [ str , Any ] ] ) : A list of dictionaries that
represent documents to be inserted to the VectorStores .
The document dictionaries must be in the following format :
{ " document_type " : " DOCUMENT_TYPE " , * * kwargs } where " DOCUMENT_TYPE "
is one of the following :
" PDF " , " CSV " , " DOCX " , " URL " , " SentenceLevelPDF " , " SentenceLevelDOCX " ,
" Unstructured " , " InMemoryText " .
The kwargs for each document type are shown below :
class PDF ( Document ) :
document_type : Literal [ " PDF " ]
path : str
metadata : Optional [ dict [ str , Any ] ] = None
on_disk : bool = False
version : str = " v1 "
chunk_size : int = 100
stride : int = 40
emphasize_first_words : int = 0
ignore_header_footer : bool = True
ignore_nonstandard_orientation : bool = True
class CSV ( Document ) :
document_type : Literal [ " CSV " ]
path : str
id_column : Optional [ str ] = None
strong_columns : Optional [ List [ str ] ] = None
weak_columns : Optional [ List [ str ] ] = None
reference_columns : Optional [ List [ str ] ] = None
save_extra_info : bool = True
metadata : Optional [ dict [ str , Any ] ] = None
has_offset : bool = False
on_disk : bool = False
class DOCX ( Document ) :
document_type : Literal [ " DOCX " ]
path : str
metadata : Optional [ dict [ str , Any ] ] = None
on_disk : bool = False
class URL ( Document ) :
document_type : Literal [ " URL " ]
url : str
save_extra_info : bool = True
title_is_strong : bool = False
metadata : Optional [ dict [ str , Any ] ] = None
on_disk : bool = False
class SentenceLevelPDF ( Document ) :
document_type : Literal [ " SentenceLevelPDF " ]
path : str
metadata : Optional [ dict [ str , Any ] ] = None
on_disk : bool = False
class SentenceLevelDOCX ( Document ) :
document_type : Literal [ " SentenceLevelDOCX " ]
path : str
metadata : Optional [ dict [ str , Any ] ] = None
on_disk : bool = False
class Unstructured ( Document ) :
document_type : Literal [ " Unstructured " ]
path : str
save_extra_info : bool = True
metadata : Optional [ dict [ str , Any ] ] = None
on_disk : bool = False
class InMemoryText ( Document ) :
document_type : Literal [ " InMemoryText " ]
name : str
texts : list [ str ]
metadatas : Optional [ list [ dict [ str , Any ] ] ] = None
global_metadata : Optional [ dict [ str , Any ] ] = None
on_disk : bool = False
For Document types with the arg " path " , ensure that
the path exists on your local machine .
"""
return self . db . insert ( documents )
def remove_documents ( self , source_ids : List [ str ] ) : # type: ignore[no-untyped-def]
"""
Deletes documents from the VectorStore using source ids .
Args :
files ( List [ str ] ) : A list of source ids to delete from the VectorStore .
"""
self . db . delete ( source_ids )