diff --git a/.env.template b/.env.template
new file mode 100644
index 0000000..8d255d1
--- /dev/null
+++ b/.env.template
@@ -0,0 +1,3 @@
+OPENAI_API_KEY = your openai key
+ACTIVELOOP_TOKEN = your activeloop key
+ACTIVELOOP_ORG_NAME = your activeloop organization name
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index e9be3b6..4f00b71 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 data
 __pycache__
-.streamlit/secrets.toml
\ No newline at end of file
+.streamlit/secrets.toml
+.env
\ No newline at end of file
diff --git a/.streamlit/secrets.toml.template b/.streamlit/secrets.toml.template
deleted file mode 100644
index 496401a..0000000
--- a/.streamlit/secrets.toml.template
+++ /dev/null
@@ -1,3 +0,0 @@
-OPENAI_API_KEY = "your openai key"
-ACTIVELOOP_TOKEN = "your activeloop key"
-ACTIVELOOP_ORG_NAME = "your activeloop organization name"
\ No newline at end of file
diff --git a/README.md b/README.md
index 9372dc1..1c7bc85 100644
--- a/README.md
+++ b/README.md
@@ -15,9 +15,5 @@ This is an app that let's you ask questions about any data source by leveraging
 
 ## Good to know
 - As default context this git repository is taken so you can directly start asking question about its functionality without chosing an own data source.
-- To run locally or deploy somewhere, execute:
-
-  ```cp .streamlit/secret.toml.template .streamlit/secret.toml```
-
-  and set necessary keys in the newly created secrets file. Another option is to manually set environment variables
+- To run locally or deploy somewhere, execute `cp .env.template .env` and set necessary keys in the newly created secrets file. Another option is to manually set environment variables
 - Yes, Chad in `DataChad` refers to the well-known [meme](https://www.google.com/search?q=chad+meme)
diff --git a/app.py b/app.py
index 4d7c01b..d192082 100644
--- a/app.py
+++ b/app.py
@@ -1,16 +1,19 @@
 import streamlit as st
+from dotenv import load_dotenv
 from streamlit_chat import message
 
 from constants import APP_NAME, DEFAULT_DATA_SOURCE, PAGE_ICON
 from utils import (
+    authenticate,
+    build_chain_and_clear_history,
     delete_uploaded_file,
     generate_response,
-    save_uploaded_file,
-    build_chain_and_clear_history,
-    validate_keys,
     logger,
+    save_uploaded_file,
 )
 
+load_dotenv()
+
 # Page options and header
 st.set_option("client.showErrorDetails", True)
 
@@ -33,22 +36,25 @@ if "data_source" not in st.session_state:
     st.session_state["data_source"] = ""
 if "uploaded_file" not in st.session_state:
     st.session_state["uploaded_file"] = None
-
+if "openai_api_key" not in st.session_state:
+    st.session_state["openai_api_key"] = None
+if "activeloop_token" not in st.session_state:
+    st.session_state["activeloop_token"] = None
+if "activeloop_org_name" not in st.session_state:
+    st.session_state["activeloop_org_name"] = None
 
 # Sidebar
 with st.sidebar:
     st.title("Authentication")
     with st.form("authentication"):
-        openai_key = st.text_input("OpenAI API Key", type="password", key="openai_key")
-        activeloop_token = st.text_input(
-            "ActiveLoop Token", type="password", key="activeloop_token"
-        )
+        openai_api_key = st.text_input("OpenAI API Key", type="password")
+        activeloop_token = st.text_input("ActiveLoop Token", type="password")
         activeloop_org_name = st.text_input(
-            "ActiveLoop Organisation Name", type="password", key="activeloop_org_name"
+            "ActiveLoop Organisation Name", type="password"
         )
         submitted = st.form_submit_button("Submit")
         if submitted:
-            validate_keys(openai_key, activeloop_token, activeloop_org_name)
+            authenticate(openai_api_key, activeloop_token, activeloop_org_name)
 
     if not st.session_state["auth_ok"]:
         st.stop()
diff --git a/requirements.txt b/requirements.txt
index ac52191..c408250 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,4 +8,5 @@ unstructured==0.6.5
 pdf2image==1.16.3
 pytesseract==0.3.10
 beautifulsoup4==4.12.2
-bs4==0.0.1
\ No newline at end of file
+bs4==0.0.1
+python-dotenv==1.0.0
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 710b9c1..284b6d5 100644
--- a/utils.py
+++ b/utils.py
@@ -1,11 +1,11 @@
+import logging
 import os
 import re
-
-import logging
+import shutil
 import sys
-import openai
+
 import deeplake
-import shutil
+import openai
 import streamlit as st
 from langchain.chains import ConversationalRetrievalChain
 from langchain.chat_models import ChatOpenAI
@@ -27,8 +27,7 @@ from langchain.embeddings.openai import OpenAIEmbeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.vectorstores import DeepLake
 
-from constants import DATA_PATH, MODEL, PAGE_ICON, APP_NAME
-
+from constants import APP_NAME, DATA_PATH, MODEL, PAGE_ICON
 
 logger = logging.getLogger(APP_NAME)
 
@@ -51,55 +50,42 @@ def configure_logger(debug=0):
 configure_logger(0)
 
 
-def validate_keys(openai_key, activeloop_token, activeloop_org_name):
-    # Validate all API related variables are set and correct
-    all_keys = [openai_key, activeloop_token, activeloop_org_name]
-    if any(all_keys):
-        if not all(all_keys):
-            st.session_state["auth_ok"] = False
-            st.error("You need to fill all fields", icon=PAGE_ICON)
-            st.stop()
-        os.environ["OPENAI_API_KEY"] = openai_key
-        os.environ["ACTIVELOOP_TOKEN"] = activeloop_token
-        os.environ["ACTIVELOOP_ORG_NAME"] = activeloop_org_name
-    else:
-        # Bypass for local development or deployments with stored credentials
-        # either env variables or streamlit secrets need to be set
-        try:
-            try:
-                assert os.environ.get("OPENAI_API_KEY")
-                assert os.environ.get("ACTIVELOOP_TOKEN")
-                assert os.environ.get("ACTIVELOOP_ORG_NAME")
-            except:
-                assert st.secrets.get("OPENAI_API_KEY")
-                assert st.secrets.get("ACTIVELOOP_TOKEN")
-                assert st.secrets.get("ACTIVELOOP_ORG_NAME")
-
-                os.environ["OPENAI_API_KEY"] = st.secrets.get("OPENAI_API_KEY")
-                os.environ["ACTIVELOOP_TOKEN"] = st.secrets.get("ACTIVELOOP_TOKEN")
-                os.environ["ACTIVELOOP_ORG_NAME"] = st.secrets.get(
-                    "ACTIVELOOP_ORG_NAME"
-                )
-        except:
-            st.session_state["auth_ok"] = False
-            st.error("No credentials stored and nothing submitted", icon=PAGE_ICON)
-            st.stop()
+def authenticate(openai_api_key, activeloop_token, activeloop_org_name):
+    """Validate the credentials and store them in the session state.
+
+    Values left empty fall back to the OPENAI_API_KEY, ACTIVELOOP_TOKEN and
+    ACTIVELOOP_ORG_NAME environment variables, which enables local dev and
+    deployments with shared credentials. On failure `auth_ok` is set to
+    False, an error is shown and `st.stop()` is called.
+    """
+    openai_api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
+    activeloop_token = activeloop_token or os.environ.get("ACTIVELOOP_TOKEN")
+    activeloop_org_name = activeloop_org_name or os.environ.get("ACTIVELOOP_ORG_NAME")
+    if not (openai_api_key and activeloop_token and activeloop_org_name):
+        st.session_state["auth_ok"] = False
+        st.error("Credentials neither set nor stored", icon=PAGE_ICON)
+        st.stop()
     try:
         # Try to access openai and deeplake
         with st.spinner("Authentifying..."):
-            openai.api_key = os.environ["OPENAI_API_KEY"]
-            openai.Model.list()
+            # Pass the key per request; assigning openai.api_key would set
+            # module-global state before the key has even been validated
+            openai.Model.list(api_key=openai_api_key)
             deeplake.exists(
-                f"hub://{os.environ['ACTIVELOOP_ORG_NAME']}/DataChad-Authentication-Check",
+                f"hub://{activeloop_org_name}/DataChad-Authentication-Check",
+                token=activeloop_token,
             )
     except Exception as e:
         logger.error(f"Authentication failed with {e}")
         st.session_state["auth_ok"] = False
         st.error("Authentication failed", icon=PAGE_ICON)
         st.stop()
-
-    logger.info("Authentification successful!")
+    # store credentials in the session state
     st.session_state["auth_ok"] = True
+    st.session_state["openai_api_key"] = openai_api_key
+    st.session_state["activeloop_token"] = activeloop_token
+    st.session_state["activeloop_org_name"] = activeloop_org_name
+    logger.info("Authentification successful!")
 
 
 def save_uploaded_file(uploaded_file):
@@ -210,14 +196,19 @@ def clean_data_source_string(data_source):
 
 def setup_vector_store(data_source):
     # either load existing vector store or upload a new one to the hub
-    embeddings = OpenAIEmbeddings(disallowed_special=())
+    embeddings = OpenAIEmbeddings(
+        disallowed_special=(), openai_api_key=st.session_state["openai_api_key"]
+    )
     data_source_name = clean_data_source_string(data_source)
-    dataset_path = f"hub://{os.environ['ACTIVELOOP_ORG_NAME']}/{data_source_name}"
-    if deeplake.exists(dataset_path):
+    dataset_path = f"hub://{st.session_state['activeloop_org_name']}/{data_source_name}"
+    if deeplake.exists(dataset_path, token=st.session_state["activeloop_token"]):
         with st.spinner("Loading vector store..."):
             logger.info(f"{dataset_path} exists -> loading")
             vector_store = DeepLake(
-                dataset_path=dataset_path, read_only=True, embedding_function=embeddings
+                dataset_path=dataset_path,
+                read_only=True,
+                embedding_function=embeddings,
+                token=st.session_state["activeloop_token"],
             )
     else:
         with st.spinner("Reading, embedding and uploading data to hub..."):
@@ -226,7 +217,8 @@ def setup_vector_store(data_source):
             vector_store = DeepLake.from_documents(
                 docs,
                 embeddings,
-                dataset_path=f"hub://{os.environ['ACTIVELOOP_ORG_NAME']}/{data_source_name}",
+                dataset_path=f"hub://{st.session_state['activeloop_org_name']}/{data_source_name}",
+                token=st.session_state["activeloop_token"],
             )
     return vector_store
 
@@ -242,7 +234,9 @@ def get_chain(data_source):
         "k": 10,
     }
     retriever.search_kwargs.update(search_kwargs)
-    model = ChatOpenAI(model_name=MODEL)
+    model = ChatOpenAI(
+        model_name=MODEL, openai_api_key=st.session_state["openai_api_key"]
+    )
     with st.spinner("Building langchain..."):
         chain = ConversationalRetrievalChain.from_llm(
             model,