import logging
import os
import re
import shutil
import sys

import deeplake
import openai
import streamlit as st
from langchain.chains import ConversationalRetrievalChain
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import (
    CSVLoader,
    DirectoryLoader,
    GitLoader,
    NotebookLoader,
    OnlinePDFLoader,
    PythonLoader,
    TextLoader,
    UnstructuredFileLoader,
    UnstructuredHTMLLoader,
    UnstructuredPDFLoader,
    UnstructuredWordDocumentLoader,
    WebBaseLoader,
)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import DeepLake
from constants import APP_NAME, DATA_PATH, MODEL, PAGE_ICON
logger = logging.getLogger(APP_NAME)
def configure_logger(debug=0):
    log_level = logging.DEBUG if debug == 1 else logging.INFO
    logger.setLevel(log_level)
    stream_handler = logging.StreamHandler(stream=sys.stdout)
    stream_handler.setLevel(log_level)
    formatter = logging.Formatter("%(message)s")
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)
    logger.propagate = False
configure_logger(0)
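# Note: passing debug=1 (i.e. configure_logger(1)) enables DEBUG-level output.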
def authenticate(openai_api_key, activeloop_token, activeloop_org_name):
    # Validate that all credentials are set and correct.
    # Check env variables to enable local dev and deployments with shared credentials.
    openai_api_key = openai_api_key or os.environ.get("OPENAI_API_KEY")
    activeloop_token = activeloop_token or os.environ.get("ACTIVELOOP_TOKEN")
    activeloop_org_name = activeloop_org_name or os.environ.get("ACTIVELOOP_ORG_NAME")
    if not (openai_api_key and activeloop_token and activeloop_org_name):
        st.session_state["auth_ok"] = False
        st.error("Credentials neither set nor stored", icon=PAGE_ICON)
        st.stop()
    try:
        # Try to access OpenAI and Deep Lake with the given credentials.
        with st.spinner("Authenticating..."):
            openai.api_key = openai_api_key
            openai.Model.list()
            deeplake.exists(
                f"hub://{activeloop_org_name}/DataChad-Authentication-Check",
                token=activeloop_token,
            )
    except Exception as e:
        logger.error(f"Authentication failed with {e}")
        st.session_state["auth_ok"] = False
        st.error("Authentication failed", icon=PAGE_ICON)
        st.stop()
    # Store the validated credentials in the session state.
    st.session_state["auth_ok"] = True
    st.session_state["openai_api_key"] = openai_api_key
    st.session_state["activeloop_token"] = activeloop_token
    st.session_state["activeloop_org_name"] = activeloop_org_name
    logger.info("Authentication successful!")
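# For local development the three credentials can come from the environment
# instead of the UI, e.g. (placeholder values):
#   export OPENAI_API_KEY="sk-..."
#   export ACTIVELOOP_TOKEN="..."
#   export ACTIVELOOP_ORG_NAME="my-org"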
def save_uploaded_file(uploaded_file):
    # Streamlit uploaded files need to be stored locally
    # before they can be embedded and uploaded to the hub.
    if not os.path.exists(DATA_PATH):
        os.makedirs(DATA_PATH)
    file_path = str(DATA_PATH / uploaded_file.name)
    uploaded_file.seek(0)
    file_bytes = uploaded_file.read()
    with open(file_path, "wb") as file:
        file.write(file_bytes)
    logger.info(f"saved {file_path}")
    return file_path
def delete_uploaded_file(uploaded_file):
    # Clean up locally stored files after they have been processed.
    file_path = DATA_PATH / uploaded_file.name
    if os.path.exists(file_path):
        os.remove(file_path)
        logger.info(f"removed {file_path}")
def load_git(data_source):
    # Thank you GitHub for the "master" to "main" switch:
    # we have to try both default branch names when cloning.
    repo_name = data_source.split("/")[-1].split(".")[0]
    repo_path = str(DATA_PATH / repo_name)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    branches = ["main", "master"]
    docs = []  # ensure docs is defined even if both branches fail
    for branch in branches:
        if os.path.exists(repo_path):
            # The repo was already cloned on a previous attempt,
            # so load it from disk instead of cloning again.
            data_source = None
        try:
            docs = GitLoader(repo_path, data_source, branch).load_and_split(
                text_splitter
            )
            break
        except Exception as e:
            logger.error(f"error loading git: {e}")
    if os.path.exists(repo_path):
        # Clean up the cloned repo afterwards.
        shutil.rmtree(repo_path)
    return docs
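# Example (illustrative URL): "https://github.com/user/repo.git" is cloned to
# DATA_PATH / "repo", split into 1000-character chunks, and then deleted again.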
def load_any_data_source(data_source):
    # Ugly dispatch that decides how to load the data source based on its type.
    is_text = data_source.endswith(".txt")
    is_web = data_source.startswith("http")
    is_pdf = data_source.endswith(".pdf")
    is_csv = data_source.endswith(".csv")
    is_html = data_source.endswith(".html")
    is_git = data_source.endswith(".git")
    is_notebook = data_source.endswith(".ipynb")
    is_doc = data_source.endswith(".doc")
    is_py = data_source.endswith(".py")
    is_dir = os.path.isdir(data_source)
    is_file = os.path.isfile(data_source)
    loader = None
    if is_dir:
        loader = DirectoryLoader(data_source, recursive=True)
    if is_git:
        return load_git(data_source)
    if is_web:
        if is_pdf:
            loader = OnlinePDFLoader(data_source)
        else:
            loader = WebBaseLoader(data_source)
    if is_file:
        if is_text:
            loader = TextLoader(data_source)
        elif is_notebook:
            loader = NotebookLoader(data_source)
        elif is_pdf:
            loader = UnstructuredPDFLoader(data_source)
        elif is_html:
            loader = UnstructuredHTMLLoader(data_source)
        elif is_doc:
            loader = UnstructuredWordDocumentLoader(data_source)
        elif is_csv:
            loader = CSVLoader(data_source, encoding="utf-8")
        elif is_py:
            loader = PythonLoader(data_source)
        else:
            loader = UnstructuredFileLoader(data_source)
    if loader:
        text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        docs = loader.load_and_split(text_splitter)
        logger.info(f"loaded {len(docs)} document chunks")
        return docs
    error_msg = f"Failed to load {data_source}"
    st.error(error_msg, icon=PAGE_ICON)
    logger.info(error_msg)
    st.stop()
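# Example dispatch (illustrative paths): "notes.txt" -> TextLoader,
# "paper.pdf" -> UnstructuredPDFLoader,
# "https://example.com/doc.pdf" -> OnlinePDFLoader.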
def clean_data_source_string(data_source):
    # Replace all non-word characters with dashes to get a string
    # that can be used as the name of a new dataset.
    dashed_string = re.sub(r"\W+", "-", data_source)
    cleaned_string = re.sub(r"--+", "-", dashed_string).strip("-")
    return cleaned_string
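# Example (illustrative): "https://github.com/user/repo.git" becomes
# "https-github-com-user-repo-git", which is safe to use as a dataset name.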
def setup_vector_store(data_source):
    # Either load an existing vector store or embed and upload a new one to the hub.
    embeddings = OpenAIEmbeddings(
        disallowed_special=(), openai_api_key=st.session_state["openai_api_key"]
    )
    data_source_name = clean_data_source_string(data_source)
    dataset_path = f"hub://{st.session_state['activeloop_org_name']}/{data_source_name}"
    if deeplake.exists(dataset_path, token=st.session_state["activeloop_token"]):
        with st.spinner("Loading vector store..."):
            logger.info(f"{dataset_path} exists -> loading")
            vector_store = DeepLake(
                dataset_path=dataset_path,
                read_only=True,
                embedding_function=embeddings,
                token=st.session_state["activeloop_token"],
            )
    else:
        with st.spinner("Reading, embedding and uploading data to hub..."):
            logger.info(f"{dataset_path} does not exist -> uploading")
            docs = load_any_data_source(data_source)
            vector_store = DeepLake.from_documents(
                docs,
                embeddings,
                dataset_path=dataset_path,
                token=st.session_state["activeloop_token"],
            )
    return vector_store
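# Because the dataset name is derived deterministically from the data source,
# loading the same source again reuses the stored embeddings on the hub
# instead of re-embedding everything.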
def get_chain(data_source):
    # Create the chain that will be called to generate responses.
    vector_store = setup_vector_store(data_source)
    retriever = vector_store.as_retriever()
    # Search configuration: fetch 20 candidates by cosine distance, then return
    # the 10 best after maximal marginal relevance re-ranking.
    search_kwargs = {
        "distance_metric": "cos",
        "fetch_k": 20,
        "maximal_marginal_relevance": True,
        "k": 10,
    }
    retriever.search_kwargs.update(search_kwargs)
    model = ChatOpenAI(
        model_name=MODEL, openai_api_key=st.session_state["openai_api_key"]
    )
    with st.spinner("Building langchain..."):
        chain = ConversationalRetrievalChain.from_llm(
            model,
            retriever=retriever,
            chain_type="stuff",
            verbose=True,
            # Cap the tokens taken up by retrieved documents to leave room
            # for the prompt and the generated answer.
            max_tokens_limit=3375,
        )
    logger.info(f"{data_source} is ready to go!")
    return chain
def build_chain_and_clear_history(data_source):
    # Get the chain and store it in the session state.
    # Also delete the chat history so the bot is not confused by old context.
    st.session_state["chain"] = get_chain(data_source)
    st.session_state["chat_history"] = []
def generate_response(prompt):
    # Call the chain to generate a response and add it to the chat history.
    with st.spinner("Generating response..."):
        response = st.session_state["chain"](
            {"question": prompt, "chat_history": st.session_state["chat_history"]}
        )
    logger.info(f"{response=}")
    st.session_state["chat_history"].append((prompt, response["answer"]))
    return response["answer"]
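# chat_history is a list of (question, answer) tuples; ConversationalRetrievalChain
# uses it to condense follow-up questions into standalone queries before retrieval.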