diff --git a/datachad/database.py b/datachad/database.py index 3ff04b4..8f4a3ba 100644 --- a/datachad/database.py +++ b/datachad/database.py @@ -1,4 +1,3 @@ -import os import re import deeplake @@ -16,12 +15,14 @@ def get_dataset_path() -> str: # to get a string that can be used to create a new dataset dataset_name = re.sub(r"\W+", "-", st.session_state["data_source"]) dataset_name = re.sub(r"--+", "- ", dataset_name).strip("-") + # we need to differentiate between differently chunked datasets + dataset_name += ( + f"-{st.session_state['chunk_size']}-{st.session_state['chunk_overlap']}" + ) if st.session_state["mode"] == MODES.LOCAL: - if not os.path.exists(DATA_PATH): - os.makedirs(DATA_PATH) dataset_path = str(DATA_PATH / dataset_name) else: - dataset_path = f"hub://{st.session_state['activeloop_org_name']}/{dataset_name}-{st.session_state['chunk_size']}" + dataset_path = f"hub://{st.session_state['activeloop_org_name']}/{dataset_name}" return dataset_path diff --git a/datachad/models.py b/datachad/models.py index d90e4ac..6f01fc8 100644 --- a/datachad/models.py +++ b/datachad/models.py @@ -59,33 +59,34 @@ class MODELS(Enum): def get_model() -> BaseLanguageModel: - match st.session_state["model"].name: - case MODELS.GPT35TURBO.name: - model = ChatOpenAI( - model_name=st.session_state["model"].name, - temperature=st.session_state["temperature"], - openai_api_key=st.session_state["openai_api_key"], - ) - case MODELS.GPT4.name: - model = ChatOpenAI( - model_name=st.session_state["model"].name, - temperature=st.session_state["temperature"], - openai_api_key=st.session_state["openai_api_key"], - ) - case MODELS.GPT4ALL.name: - model = GPT4All( - model=st.session_state["model"].path, - n_ctx=st.session_state["model_n_ctx"], - backend="gptj", - temp=st.session_state["temperature"], - verbose=True, - ) - # Added models need to be cased here - case _default: - msg = f"Model {st.session_state['model']} not supported!" 
- logger.error(msg) - st.error(msg) - exit + with st.spinner("Loading Model..."): + match st.session_state["model"].name: + case MODELS.GPT35TURBO.name: + model = ChatOpenAI( + model_name=st.session_state["model"].name, + temperature=st.session_state["temperature"], + openai_api_key=st.session_state["openai_api_key"], + ) + case MODELS.GPT4.name: + model = ChatOpenAI( + model_name=st.session_state["model"].name, + temperature=st.session_state["temperature"], + openai_api_key=st.session_state["openai_api_key"], + ) + case MODELS.GPT4ALL.name: + model = GPT4All( + model=st.session_state["model"].path, + n_ctx=st.session_state["model_n_ctx"], + backend="gptj", + temp=st.session_state["temperature"], + verbose=True, + ) + # Added models need to be cased here + case _default: + msg = f"Model {st.session_state['model']} not supported!" + logger.error(msg) + st.error(msg) + exit return model diff --git a/requirements.txt b/requirements.txt index 1927023..ee421d8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,8 +1,8 @@ streamlit==1.22.0 streamlit-chat==0.0.2.2 -deeplake==3.4.1 +deeplake==3.5.2 openai==0.27.6 -langchain==0.0.173 +langchain==0.0.178 tiktoken==0.4.0 unstructured==0.6.5 pdf2image==1.16.3 @@ -11,4 +11,4 @@ beautifulsoup4==4.12.2 bs4==0.0.1 python-dotenv==1.0.0 sentence-transformers==2.2.2 -pygpt4all==1.1.0 \ No newline at end of file +pygpt4all==0.2.3 \ No newline at end of file