fix git loader

main
Gustav von Zitzewitz 1 year ago
parent 5c25f3f4f7
commit 3dfff5a41d

4
.gitignore vendored

@ -1,3 +1,5 @@
data
__pycache__
.env
.env
.ipynb_checkpoints
testing.ipynb

@ -122,6 +122,7 @@ if clear_button:
st.session_state["generated"] = []
st.session_state["chat_history"] = []
# file upload and data source inputs
uploaded_file = st.file_uploader("Upload a file")
data_source = st.text_input(
@ -129,7 +130,6 @@ data_source = st.text_input(
placeholder="Any path or url pointing to a file or directory of files",
)
# generate new chain for new data source / uploaded file
# make sure to do this only once per input / on change
if data_source and data_source != st.session_state["data_source"]:
@ -145,6 +145,7 @@ if uploaded_file and uploaded_file != st.session_state["uploaded_file"]:
update_chain()
delete_uploaded_file(uploaded_file)
# container for chat history
response_container = st.container()
# container for text box

@ -12,7 +12,7 @@ MAX_TOKENS = 3357
ENABLE_ADVANCED_OPTIONS = True
DATA_PATH = Path.cwd() / "data"
DEFAULT_DATA_SOURCE = "git@github.com:gustavz/DataChad.git"
DEFAULT_DATA_SOURCE = "https://github.com/gustavz/DataChad.git"
REPO_URL = "https://github.com/gustavz/DataChad"

@ -197,39 +197,41 @@ def delete_uploaded_file(uploaded_file: UploadedFile) -> None:
def handle_load_error(e: str = None) -> None:
    """Report a failed data-source load to the user and the log, then stop.

    The message is rendered with ``st.error``, written to the logger at
    error level and followed by a call to ``st.stop()``.

    Args:
        e: Description of what went wrong (callers also pass exception
            objects, which are formatted via the f-string below). When
            falsy, a generic "no loader found" hint pointing at
            ``REPO_URL`` is used instead.
    """
    # The fallback has to be an f-string; as a plain string the literal text
    # "{REPO_URL}" would be shown to the user instead of the repository link.
    e = e or f"No Loader found for your data source. Consider contributing:  {REPO_URL}!"
    error_msg = f"Failed to load '{st.session_state['data_source']}':\n\n{e}"
    st.error(error_msg, icon=PAGE_ICON)
    logger.error(error_msg)
    st.stop()
def load_git(data_source: str, chunk_size: int = CHUNK_SIZE) -> List[Document]:
    """Load a git repository and split its content into document chunks.

    The repository is loaded into ``DATA_PATH / <repo name>`` with
    ``GitLoader``, trying the branches "main" and "master" in that order,
    and the local checkout is removed again afterwards.

    Args:
        data_source: Git URL of the repository; the last path segment up to
            its first "." is used as the local directory name.
        chunk_size: Chunk size handed to ``RecursiveCharacterTextSplitter``.

    Returns:
        The chunks returned by ``GitLoader(...).load_and_split`` for the
        first branch that loads. If no branch loads,
        ``handle_load_error`` is called with a hint to use HTTPS links.
    """
    # We need to make sure the data path exists.
    os.makedirs(DATA_PATH, exist_ok=True)

    # Strip trailing slashes first: an empty repo name would make repo_path
    # equal to DATA_PATH, and the cleanup below would delete the whole
    # data directory.
    repo_name = data_source.rstrip("/").split("/")[-1].split(".")[0]
    repo_path = str(DATA_PATH / repo_name)

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=0
    )

    docs = None
    # We need to try both common main branches
    # Thank you github for the "master" to "main" switch
    for branch in ("main", "master"):
        # Decide per attempt whether to clone: if a failed attempt left a
        # checkout behind, the retry has to reuse it instead of cloning
        # again. NOTE(review): presumably GitLoader clones before checking
        # out the branch - confirm against the installed langchain version.
        clone_url = None if os.path.exists(repo_path) else data_source
        try:
            docs = GitLoader(repo_path, clone_url, branch).load_and_split(
                text_splitter
            )
            break
        except Exception as e:
            logger.error(f"Error loading git: {e}")

    if os.path.exists(repo_path):
        # cleanup repo afterwards
        shutil.rmtree(repo_path)

    # Explicit sentinel check instead of catching the unbound-local error
    # of a bare ``return docs``; an empty list is still a valid result.
    if docs is None:
        msg = "Make sure to use HTTPS git repo links"
        handle_load_error(msg)
    return docs
def load_any_data_source(
@ -285,7 +287,12 @@ def load_any_data_source(
logger.info(f"Loaded: {len(docs)} document chucks")
return docs
except Exception as e:
handle_load_error(e if loader else None)
msg = (
e
if loader
else f"No Loader found for your data source. Consider contributing:  {REPO_URL}!"
)
handle_load_error(msg)
def clean_data_source_string(data_source_string: str) -> str:
@ -378,7 +385,7 @@ def update_chain() -> None:
)
st.session_state["chat_history"] = []
except Exception as e:
msg = f"Failed to build chain for data source {st.session_state['data_source']} with error: {e}"
msg = f"Failed to build chain for data source '{st.session_state['data_source']}' with error: {e}"
logger.error(msg)
st.error(msg, icon=PAGE_ICON)

Loading…
Cancel
Save