switch to .env credentials

pull/1/head
Gustav von Zitzewitz 1 year ago
parent 0554c6a402
commit 1e20e055dd

@@ -0,0 +1,3 @@
+OPENAI_API_KEY = your openai key
+ACTIVELOOP_TOKEN = your activeloop key
+ACTIVELOOP_ORG_NAME = your activeloop organization name
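For orientation, the keys in this new template are consumed at app start-up: `load_dotenv()` (added to the app further down) copies the `.env` key/value pairs into the process environment, where the rest of the code reads them. A minimal, illustrative sketch, not part of the commit:

```python
import os

from dotenv import load_dotenv  # provided by the python-dotenv pin added in requirements.txt

# Read key/value pairs from a local .env file (created from this template) into
# os.environ; variables already exported in the shell are left untouched.
load_dotenv()

openai_api_key = os.environ.get("OPENAI_API_KEY")
activeloop_token = os.environ.get("ACTIVELOOP_TOKEN")
activeloop_org_name = os.environ.get("ACTIVELOOP_ORG_NAME")
```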

.gitignore vendored

@@ -1,3 +1,4 @@
 data
 __pycache__
 .streamlit/secrets.toml
+.env

@@ -1,3 +0,0 @@
-OPENAI_API_KEY = "your openai key"
-ACTIVELOOP_TOKEN = "your activeloop key"
-ACTIVELOOP_ORG_NAME = "your activeloop organization name"

@@ -15,9 +15,5 @@ This is an app that let's you ask questions about any data source by leveraging
 ## Good to know
 - As default context this git repository is taken so you can directly start asking question about its functionality without chosing an own data source.
-- To run locally or deploy somewhere, execute:
-```cp .streamlit/secret.toml.template .streamlit/secret.toml```
-and set necessary keys in the newly created secrets file. Another option is to manually set environment variables
+- To run locally or deploy somewhere, execute `cp .env.template .env` and set necessary keys in the newly created secrets file. Another option is to manually set environment variables
 - Yes, Chad in `DataChad` refers to the well-known [meme](https://www.google.com/search?q=chad+meme)

@@ -1,16 +1,19 @@
 import streamlit as st
+from dotenv import load_dotenv
 from streamlit_chat import message
 from constants import APP_NAME, DEFAULT_DATA_SOURCE, PAGE_ICON
 from utils import (
+    authenticate,
+    build_chain_and_clear_history,
     delete_uploaded_file,
     generate_response,
-    save_uploaded_file,
-    build_chain_and_clear_history,
-    validate_keys,
     logger,
+    save_uploaded_file,
 )
+load_dotenv()
 # Page options and header
 st.set_option("client.showErrorDetails", True)
@@ -33,22 +36,25 @@ if "data_source" not in st.session_state:
     st.session_state["data_source"] = ""
 if "uploaded_file" not in st.session_state:
     st.session_state["uploaded_file"] = None
+if "openai_api_key" not in st.session_state:
+    st.session_state["openai_api_key"] = None
+if "activeloop_token" not in st.session_state:
+    st.session_state["activeloop_token"] = None
+if "activeloop_org_name" not in st.session_state:
+    st.session_state["activeloop_org_name"] = None
 # Sidebar
 with st.sidebar:
     st.title("Authentication")
     with st.form("authentication"):
-        openai_key = st.text_input("OpenAI API Key", type="password", key="openai_key")
-        activeloop_token = st.text_input(
-            "ActiveLoop Token", type="password", key="activeloop_token"
-        )
+        openai_api_key = st.text_input("OpenAI API Key", type="password")
+        activeloop_token = st.text_input("ActiveLoop Token", type="password")
         activeloop_org_name = st.text_input(
-            "ActiveLoop Organisation Name", type="password", key="activeloop_org_name"
+            "ActiveLoop Organisation Name", type="password"
         )
         submitted = st.form_submit_button("Submit")
     if submitted:
-        validate_keys(openai_key, activeloop_token, activeloop_org_name)
+        authenticate(openai_api_key, activeloop_token, activeloop_org_name)
     if not st.session_state["auth_ok"]:
         st.stop()
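A note on the new session-state block above (an explanatory aside, not part of the commit): Streamlit re-runs the whole script on every widget interaction, so any key that later code reads must be seeded before its first use. The same initialization pattern, condensed:

```python
import streamlit as st

# Streamlit re-executes the script on each interaction; pre-seeding the keys
# guarantees that st.session_state["..."] lookups never raise before the user
# has submitted the authentication form.
for key in ("openai_api_key", "activeloop_token", "activeloop_org_name"):
    if key not in st.session_state:
        st.session_state[key] = None
```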

@@ -8,4 +8,5 @@ unstructured==0.6.5
 pdf2image==1.16.3
 pytesseract==0.3.10
 beautifulsoup4==4.12.2
 bs4==0.0.1
+python-dotenv==1.0.0
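The new pin is what provides the `dotenv` module imported in the app. A quick, hypothetical post-install sanity check (not part of the repository):

```python
# Hypothetical check after `pip install -r requirements.txt`.
from importlib.metadata import version

from dotenv import load_dotenv  # import fails if python-dotenv is missing

print(version("python-dotenv"))  # expected to print 1.0.0 with the pin above
```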

@ -1,11 +1,11 @@
import logging
import os import os
import re import re
import shutil
import logging
import sys import sys
import openai
import deeplake import deeplake
import shutil import openai
import streamlit as st import streamlit as st
from langchain.chains import ConversationalRetrievalChain from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI from langchain.chat_models import ChatOpenAI
@ -27,8 +27,7 @@ from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DeepLake from langchain.vectorstores import DeepLake
from constants import DATA_PATH, MODEL, PAGE_ICON, APP_NAME from constants import APP_NAME, DATA_PATH, MODEL, PAGE_ICON
logger = logging.getLogger(APP_NAME) logger = logging.getLogger(APP_NAME)
@@ -51,55 +50,36 @@ def configure_logger(debug=0):
 configure_logger(0)
-def validate_keys(openai_key, activeloop_token, activeloop_org_name):
-    # Validate all API related variables are set and correct
-    all_keys = [openai_key, activeloop_token, activeloop_org_name]
-    if any(all_keys):
-        if not all(all_keys):
-            st.session_state["auth_ok"] = False
-            st.error("You need to fill all fields", icon=PAGE_ICON)
-            st.stop()
-        os.environ["OPENAI_API_KEY"] = openai_key
-        os.environ["ACTIVELOOP_TOKEN"] = activeloop_token
-        os.environ["ACTIVELOOP_ORG_NAME"] = activeloop_org_name
-    else:
-        # Bypass for local development or deployments with stored credentials
-        # either env variables or streamlit secrets need to be set
-        try:
-            try:
-                assert os.environ.get("OPENAI_API_KEY")
-                assert os.environ.get("ACTIVELOOP_TOKEN")
-                assert os.environ.get("ACTIVELOOP_ORG_NAME")
-            except:
-                assert st.secrets.get("OPENAI_API_KEY")
-                assert st.secrets.get("ACTIVELOOP_TOKEN")
-                assert st.secrets.get("ACTIVELOOP_ORG_NAME")
-                os.environ["OPENAI_API_KEY"] = st.secrets.get("OPENAI_API_KEY")
-                os.environ["ACTIVELOOP_TOKEN"] = st.secrets.get("ACTIVELOOP_TOKEN")
-                os.environ["ACTIVELOOP_ORG_NAME"] = st.secrets.get(
-                    "ACTIVELOOP_ORG_NAME"
-                )
-        except:
-            st.session_state["auth_ok"] = False
-            st.error("No credentials stored and nothing submitted", icon=PAGE_ICON)
-            st.stop()
+def authenticate(openai_api_key, activeloop_token, activeloop_org_name):
+    # Validate all credentials are set and correct
+    # Check for env variables to enable local dev and deployments with shared credentials
+    openai_api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
+    activeloop_token = activeloop_token or os.environ.get("ACTIVELOOP_TOKEN")
+    activeloop_org_name = activeloop_org_name or os.environ.get("ACTIVELOOP_ORG_NAME")
+    if not (openai_api_key and activeloop_token and activeloop_org_name):
+        st.session_state["auth_ok"] = False
+        st.error("Credentials neither set nor stored", icon=PAGE_ICON)
+        st.stop()
     try:
         # Try to access openai and deeplake
         with st.spinner("Authentifying..."):
-            openai.api_key = os.environ["OPENAI_API_KEY"]
+            openai.api_key = openai_api_key
             openai.Model.list()
             deeplake.exists(
-                f"hub://{os.environ['ACTIVELOOP_ORG_NAME']}/DataChad-Authentication-Check",
+                f"hub://{activeloop_org_name}/DataChad-Authentication-Check",
+                token=activeloop_token,
             )
     except Exception as e:
         logger.error(f"Authentication failed with {e}")
         st.session_state["auth_ok"] = False
         st.error("Authentication failed", icon=PAGE_ICON)
         st.stop()
-    logger.info("Authentification successful!")
+    # store credentials in the session state
     st.session_state["auth_ok"] = True
+    st.session_state["openai_api_key"] = openai_api_key
+    st.session_state["activeloop_token"] = activeloop_token
+    st.session_state["activeloop_org_name"] = activeloop_org_name
+    logger.info("Authentification successful!")
 def save_uploaded_file(uploaded_file):
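To make the new validation flow explicit (an illustrative aside, not part of the commit): values typed into the form take precedence, the environment fills any gaps, and the cheap API round-trips in the hunk above (`openai.Model.list()` and `deeplake.exists(...)`) act as live credential checks. Stripped of the Streamlit state handling, the check amounts to roughly:

```python
import os

import deeplake
import openai

# Stand-alone sketch of the check performed by authenticate() above.
openai_api_key = os.environ.get("OPENAI_API_KEY")
activeloop_token = os.environ.get("ACTIVELOOP_TOKEN")
activeloop_org_name = os.environ.get("ACTIVELOOP_ORG_NAME")

openai.api_key = openai_api_key
openai.Model.list()  # fails if the OpenAI key is rejected
deeplake.exists(
    f"hub://{activeloop_org_name}/DataChad-Authentication-Check",
    token=activeloop_token,  # fails if the token/organisation pair is rejected
)
```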
@@ -210,14 +190,19 @@ def clean_data_source_string(data_source):
 def setup_vector_store(data_source):
     # either load existing vector store or upload a new one to the hub
-    embeddings = OpenAIEmbeddings(disallowed_special=())
+    embeddings = OpenAIEmbeddings(
+        disallowed_special=(), openai_api_key=st.session_state["openai_api_key"]
+    )
     data_source_name = clean_data_source_string(data_source)
-    dataset_path = f"hub://{os.environ['ACTIVELOOP_ORG_NAME']}/{data_source_name}"
-    if deeplake.exists(dataset_path):
+    dataset_path = f"hub://{st.session_state['activeloop_org_name']}/{data_source_name}"
+    if deeplake.exists(dataset_path, token=st.session_state["activeloop_token"]):
         with st.spinner("Loading vector store..."):
             logger.info(f"{dataset_path} exists -> loading")
             vector_store = DeepLake(
-                dataset_path=dataset_path, read_only=True, embedding_function=embeddings
+                dataset_path=dataset_path,
+                read_only=True,
+                embedding_function=embeddings,
+                token=st.session_state["activeloop_token"],
             )
     else:
         with st.spinner("Reading, embedding and uploading data to hub..."):
@@ -226,7 +211,8 @@ def setup_vector_store(data_source):
             vector_store = DeepLake.from_documents(
                 docs,
                 embeddings,
-                dataset_path=f"hub://{os.environ['ACTIVELOOP_ORG_NAME']}/{data_source_name}",
+                dataset_path=f"hub://{st.session_state['activeloop_org_name']}/{data_source_name}",
+                token=st.session_state["activeloop_token"],
             )
     return vector_store
@@ -242,7 +228,9 @@ def get_chain(data_source):
         "k": 10,
     }
     retriever.search_kwargs.update(search_kwargs)
-    model = ChatOpenAI(model_name=MODEL)
+    model = ChatOpenAI(
+        model_name=MODEL, openai_api_key=st.session_state["openai_api_key"]
+    )
     with st.spinner("Building langchain..."):
         chain = ConversationalRetrievalChain.from_llm(
             model,
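Taken together, the utils.py hunks apply one pattern: the credentials stored in `st.session_state` by `authenticate()` are handed explicitly to every downstream client instead of being read from `os.environ`. A condensed, illustrative sketch of that pattern (the dataset name is a placeholder, not from the repo):

```python
import streamlit as st
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import DeepLake

from constants import MODEL

# Every client receives its credential explicitly; nothing relies on os.environ.
embeddings = OpenAIEmbeddings(
    disallowed_special=(), openai_api_key=st.session_state["openai_api_key"]
)
vector_store = DeepLake(
    dataset_path=f"hub://{st.session_state['activeloop_org_name']}/my-data-source",  # placeholder name
    read_only=True,
    embedding_function=embeddings,
    token=st.session_state["activeloop_token"],
)
model = ChatOpenAI(model_name=MODEL, openai_api_key=st.session_state["openai_api_key"])
```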
