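"""Vector store helpers for DataChad: build Deep Lake dataset paths and load
or create the Deep Lake vector store for the selected data source."""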
import re
import deeplake
import streamlit as st
from langchain.vectorstores import DeepLake, VectorStore
from datachad.constants import DATA_PATH
from datachad.loader import load_data_source
from datachad.models import MODES, get_embeddings
from datachad.utils import logger


def get_dataset_path() -> str:
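    """Build the Deep Lake dataset path for the current session.

    Non-word characters in the data source are replaced with dashes, and the
    chunking parameters are appended so differently chunked datasets get
    distinct names. For example (illustrative values), a data_source of
    "https://docs.example.com/guide.pdf" with chunk_size=512 and
    chunk_overlap=51 yields the name "https-docs-example-com-guide-pdf-512-51".
    """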
    # replace all non-word characters with dashes
    # to get a string that can be used to create a new dataset
    dataset_name = re.sub(r"\W+", "-", st.session_state["data_source"])
    dataset_name = re.sub(r"--+", "-", dataset_name).strip("-")
    # we need to differentiate between differently chunked datasets
    dataset_name += (
        f"-{st.session_state['chunk_size']}-{st.session_state['chunk_overlap']}"
    )
    if st.session_state["mode"] == MODES.LOCAL:
        dataset_path = str(DATA_PATH / dataset_name)
    else:
        dataset_path = f"hub://{st.session_state['activeloop_org_name']}/{dataset_name}"
    return dataset_path


def get_vector_store() -> VectorStore:
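    """Return the Deep Lake vector store for the current dataset path.

    If the dataset already exists it is opened read-only; otherwise the data
    source is read, embedded, and uploaded via DeepLake.from_documents.
    """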
    # either load existing vector store or upload a new one to the hub
    embeddings = get_embeddings()
    dataset_path = get_dataset_path()
    if deeplake.exists(dataset_path, token=st.session_state["activeloop_token"]):
        with st.spinner("Loading vector store..."):
            logger.info(f"Dataset '{dataset_path}' exists -> loading")
            vector_store = DeepLake(
                dataset_path=dataset_path,
                read_only=True,
                embedding_function=embeddings,
                token=st.session_state["activeloop_token"],
            )
    else:
        with st.spinner("Reading, embedding and uploading data to hub..."):
            logger.info(f"Dataset '{dataset_path}' does not exist -> uploading")
            docs = load_data_source()
            vector_store = DeepLake.from_documents(
                docs,
                embeddings,
                dataset_path=dataset_path,
                token=st.session_state["activeloop_token"],
            )
    return vector_store
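
# Illustrative usage (a sketch, not part of this module): the Streamlit app
# populates these session-state keys in its sidebar before calling
# get_vector_store(); the values below are made up for illustration.
#
#     st.session_state["data_source"] = "https://docs.example.com/guide.pdf"
#     st.session_state["chunk_size"] = 512
#     st.session_state["chunk_overlap"] = 51
#     st.session_state["mode"] = MODES.LOCAL
#     st.session_state["activeloop_token"] = "<your-token>"
#     st.session_state["activeloop_org_name"] = "<your-org>"
#     vector_store = get_vector_store()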