Merge branch 'main' into custom-llm

Alex 2023-02-15 14:42:57 +00:00
commit f9fe3f2f48
11 changed files with 219 additions and 32 deletions

.github/workflows/ci.yml (vendored, new file, 44 lines changed)

@@ -0,0 +1,44 @@
name: Build and push DocsGPT Docker image
on:
workflow_dispatch:
push:
branches:
- main
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Login to ghcr.io
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
      # Build the image and push it to docker.io and ghcr.io
- name: Build and push Docker images to docker.io and ghcr.io
uses: docker/build-push-action@v2
with:
file: './application/Dockerfile'
platforms: linux/amd64
context: ./application
push: true
tags: |
${{ secrets.DOCKER_USERNAME }}/docsgpt:latest
ghcr.io/${{ github.repository_owner }}/docsgpt:latest

.gitignore (vendored, 1 line changed)

@@ -161,3 +161,4 @@ frontend/*.sw?
application/vectors/
**/inputs

CONTRIBUTING.md (new file, 38 lines changed)

@@ -0,0 +1,38 @@
# Welcome to the DocsGPT Contributing Guidelines
Thank you for choosing this project to contribute to; we are all very grateful!
# We accept different types of contributions
📣 Discussions - where you can start a new topic or answer some questions
🐞 Issues - how we track tasks; sometimes they are bugs that need fixing, sometimes they are new features
🛠️ Pull requests - how you can suggest changes to our repository, whether to work on an existing issue or to add new features
📚 Wiki - where we have our documentation
## 🐞 Issues and Pull requests
We value contributions to our issues in the form of discussion or suggestions; we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2).
If you want to contribute by writing code, there are a few things you should know before doing it:
We have a frontend (React, Vite) and a backend (Python).
### If you are looking to contribute to Frontend (⚛React, Vite):
The current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new one. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues, as well as the [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1).
Please try to follow the guidelines.
### If you are looking to contribute to Backend (🐍Python):
Check out our issues and contribute to /application or /scripts (ignore the old ingest_rst.py and ingest_rst_sphinx.py files; they will be deprecated soon).
Currently we don't have any tests (which would be useful 😉), but before submitting your PR make sure that after you have ingested some test data it is queryable.
### Workflow:
Create a fork, make changes on your forked repository, and submit your changes in the form of a pull request.
## Questions / collaboration
Please join our [Discord](https://discord.gg/n5BX8dh8rU); don't hesitate, we are very friendly and welcoming to new contributors.
# Thank you so much for considering contributing to DocsGPT! 🙏


@@ -57,7 +57,7 @@ Copy .env_sample and create .env with your openai api token
## [Guides](https://github.com/arc53/docsgpt/wiki)
## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md)
## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation)


@@ -5,6 +5,7 @@ import datetime
from flask import Flask, request, render_template
# os.environ["LANGCHAIN_HANDLER"] = "langchain"
import faiss
from langchain import FAISS
from langchain import OpenAI, VectorDBQA, HuggingFaceHub, Cohere
from langchain.chains.question_answering import load_qa_chain
@@ -77,6 +78,7 @@ def api_answer():
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template)
# create a chain with the prompt template and the store
#llm = ManifestWrapper(client=manifest, llm_kwargs={"temperature": 0.001, "max_tokens": 2048})
llm = OpenAI(openai_api_key=api_key, temperature=0)
#llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
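The commented-out alternatives above (ManifestWrapper, HuggingFaceHub) mark where the LLM backend gets swapped. As a rough illustration of how that choice could be made configurable on this custom-llm branch (not part of this commit; LLM_NAME is a hypothetical setting), the backend could be selected at startup:

import os
from langchain import OpenAI, HuggingFaceHub, Cohere

def build_llm(api_key):
    # LLM_NAME is a hypothetical environment variable used only for this sketch.
    llm_name = os.environ.get("LLM_NAME", "openai")
    if llm_name == "openai":
        return OpenAI(openai_api_key=api_key, temperature=0)
    if llm_name == "huggingface":
        # Same model as the commented-out line above.
        return HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
    if llm_name == "cohere":
        return Cohere(cohere_api_key=api_key)
    raise ValueError(f"Unknown LLM_NAME: {llm_name}")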


@@ -45,6 +45,7 @@ pytz==2022.7.1
PyYAML==6.0
regex==2022.10.31
requests==2.28.2
retry==0.9.2
six==1.16.0
snowballstemmer==2.2.0
Sphinx==6.1.3
@@ -60,6 +61,7 @@ tiktoken==0.1.2
tokenizers==0.13.2
tqdm==4.64.1
transformers==4.26.0
typer==0.7.0
typing-inspect==0.8.0
typing_extensions==4.4.0
urllib3==1.26.14


@@ -1,6 +1,11 @@
from collections import defaultdict
import os
import sys
import nltk
import dotenv
import typer
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
@@ -10,28 +15,69 @@ from parser.open_ai_func import call_openai_api, get_user_permission
dotenv.load_dotenv()
#Specify your folder HERE
directory_to_ingest = 'inputs'
app = typer.Typer(add_completion=False)
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
#Splits all files in specified folder to documents
raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data()
raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
text_splitter = RecursiveCharacterTextSplitter()
docs = text_splitter.split_documents(raw_docs)
@app.command()
def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
help="Whether to skip price confirmation"),
dir: Optional[List[str]] = typer.Option(["inputs"],
help="""List of paths to directory for index creation.
E.g. --dir inputs --dir inputs2"""),
file: Optional[List[str]] = typer.Option(None,
help="""File paths to use (Optional; overrides dir).
E.g. --file inputs/1.md --file inputs/2.md"""),
recursive: Optional[bool] = typer.Option(True,
help="Whether to recursively search in subdirectories."),
limit: Optional[int] = typer.Option(None,
help="Maximum number of files to read."),
formats: Optional[List[str]] = typer.Option([".rst", ".md"],
help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""),
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")):
# Here we check for command line arguments for bot calls.
# If no argument exists or the permission_bypass_flag argument is not '-y',
# user permission is requested to call the API.
if len(sys.argv) > 1:
permission_bypass_flag = sys.argv[1]
if permission_bypass_flag == '-y':
call_openai_api(docs)
else:
get_user_permission(docs)
else:
get_user_permission(docs)
"""
Creates index from specified location or files.
By default /inputs folder is used, .rst and .md are parsed.
"""
def process_one_docs(directory, folder_name):
raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude).load_data()
raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
print(raw_docs)
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
text_splitter = RecursiveCharacterTextSplitter()
docs = text_splitter.split_documents(raw_docs)
# Here we check for command line arguments for bot calls.
# If no argument exists or the yes is not True, then the
# user permission is requested to call the API.
if len(sys.argv) > 1:
if yes:
call_openai_api(docs, folder_name)
else:
get_user_permission(docs, folder_name)
else:
get_user_permission(docs, folder_name)
folder_counts = defaultdict(int)
folder_names = []
for dir_path in dir:
folder_name = os.path.basename(os.path.normpath(dir_path))
folder_counts[folder_name] += 1
if folder_counts[folder_name] > 1:
folder_name = f"{folder_name}_{folder_counts[folder_name]}"
folder_names.append(folder_name)
for directory, folder_name in zip(dir, folder_names):
process_one_docs(directory, folder_name)
if __name__ == "__main__":
app()
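For context on the chunking step above: documents are split because of LLM context limits. A minimal sketch of the splitter in isolation (the chunk_size and chunk_overlap values here are illustrative; the script constructs the splitter with the library defaults):

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Illustrative parameters; not the values used by ingest.py itself.
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
chunks = splitter.split_text("some long document text " * 200)
print(len(chunks), len(chunks[0]))

Since the Typer app registers a single command, the script can then be run as, for example, python ingest.py --dir inputs --dir inputs2 -y.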


@@ -29,6 +29,18 @@ def convert_rst_to_txt(src_dir, dst_dir):
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.


@@ -24,6 +24,8 @@ class RstParser(BaseParser):
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be careful with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
@@ -34,6 +36,8 @@ class RstParser(BaseParser):
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._remove_table_excess = remove_table_excess
self._remove_interpreters = remove_interpreters
self._remove_directives = remove_directives
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
@@ -95,6 +99,18 @@ class RstParser(BaseParser):
content = re.sub(pattern, r"\1", content)
return content
def remove_directives(self, content: str) -> str:
"""Removes reStructuredText Directives"""
pattern = r"`\.\.([^:]+)::"
content = re.sub(pattern, "", content)
return content
def remove_interpreters(self, content: str) -> str:
"""Removes reStructuredText Interpreted Text Roles"""
pattern = r":(\w+):"
content = re.sub(pattern, "", content)
return content
def remove_table_excess(self, content: str) -> str:
"""Pattern to remove grid table separators"""
pattern = r"^\+[-]+\+[-]+\+$"
@@ -129,6 +145,10 @@ class RstParser(BaseParser):
content = self.remove_images(content)
if self._remove_table_excess:
content = self.remove_table_excess(content)
if self._remove_directives:
content = self.remove_directives(content)
if self._remove_interpreters:
content = self.remove_interpreters(content)
rst_tups = self.rst_to_tups(content)
if self._remove_whitespaces_excess:
rst_tups = self.remove_whitespaces_excess(rst_tups)
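To make the two new options concrete: remove_directives drops ".. name::" directive markers matched by the pattern added above, and remove_interpreters strips interpreted-text role prefixes such as :ref: via the :(\w+): pattern. A tiny standalone illustration of the role stripping (a sketch, not repository code):

import re

text = "See :ref:`installation` for details."
print(re.sub(r":(\w+):", "", text))  # -> See `installation` for details.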


@@ -1,10 +1,15 @@
import os
import faiss
import pickle
import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
#from langchain.embeddings import HuggingFaceEmbeddings
from retry import retry
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
@@ -13,8 +18,17 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
total_price = ((num_tokens/1000) * 0.0004)
return num_tokens, total_price
def call_openai_api(docs):
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
store.add_texts([i.page_content], metadatas=[i.metadata])
def call_openai_api(docs, folder_name):
# Function to create a vector store from the documents and save it to disk.
# create output folder if it doesn't exist
if not os.path.exists(f"outputs/{folder_name}"):
os.makedirs(f"outputs/{folder_name}")
from tqdm import tqdm
docs_test = [docs[0]]
# remove the first element from docs
@@ -31,21 +45,29 @@ def call_openai_api(docs):
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
try:
import time
store.add_texts([i.page_content], metadatas=[i.metadata])
store_add_texts_with_retry(store, i)
except Exception as e:
print(e)
print("Error on ", i)
print("Saving progress")
print(f"stopped at {c1} out of {len(docs)}")
store.save_local("outputs")
print("Sleeping for 10 seconds and trying again")
time.sleep(10)
faiss.write_index(store.index, f"outputs/{folder_name}/docs.index")
store_index_bak = store.index
store.index = None
with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
print("Sleeping for 60 seconds and trying again")
time.sleep(60)
store.index = store_index_bak
store.add_texts([i.page_content], metadatas=[i.metadata])
c1 += 1
store.save_local("outputs")
faiss.write_index(store.index, f"outputs/{folder_name}/docs.index")
store.index = None
with open(f"outputs/{folder_name}/faiss_store.pkl", "wb") as f:
pickle.dump(store, f)
def get_user_permission(docs):
def get_user_permission(docs, folder_name):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
#docs_content = (" ".join(docs))
@@ -61,8 +83,8 @@ def get_user_permission(docs):
#Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api(docs)
call_openai_api(docs, folder_name)
elif user_input == "":
call_openai_api(docs)
call_openai_api(docs, folder_name)
else:
print("The API was not called. No money was spent.")
print("The API was not called. No money was spent.")