Merge branch 'arc53:main' into taylor-working

pull/72/head^2
Taylor Svec 2 years ago committed by GitHub
commit 85b2f54829
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,44 @@
name: Build and push DocsGPT Docker image
on:
workflow_dispatch:
push:
branches:
- main
jobs:
deploy:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: Set up QEMU
uses: docker/setup-qemu-action@v1
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKER_USERNAME }}
password: ${{ secrets.DOCKER_PASSWORD }}
- name: Login to ghcr.io
uses: docker/login-action@v2
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
# Builds the image from ./application and pushes it to both registries
- name: Build and push Docker images to docker.io and ghcr.io
uses: docker/build-push-action@v2
with:
file: './application/Dockerfile'
platforms: linux/amd64
context: ./application
push: true
tags: |
${{ secrets.DOCKER_USERNAME }}/docsgpt:latest
ghcr.io/${{ github.repository_owner }}/docsgpt:latest

1
.gitignore vendored

@ -161,3 +161,4 @@ frontend/*.sw?
application/vectors/
**/inputs

@ -0,0 +1,38 @@
# Welcome to the DocsGPT Contributing Guidelines
Thank you for choosing to contribute to this project. We are all very grateful!
# We accept different types of contributions
📣 Discussions - where you can start a new topic or answer some questions
🐞 Issues - how we track tasks; sometimes it's bugs that need fixing, sometimes it's new features
🛠️ Pull requests - how you can suggest changes to our repository, whether to work on an existing issue or to add new features
📚 Wiki - where we have our documentation
## 🐞 Issues and Pull requests
We value contributions to our issues in the form of discussions or suggestions. We recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2).
If you want to contribute by writing code, there are a few things that you should know before doing it:
We have a frontend (React, Vite) and a backend (Python).
### If you are looking to contribute to Frontend (⚛React, Vite):
The current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new one. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues, and also the [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1) design.
Please try to follow the guidelines.
### If you are looking to contribute to Backend (🐍Python):
Check out our issues, and contribute to /application or /scripts (ignore the old ingest_rst.py and ingest_rst_sphinx.py files; they will be deprecated soon).
Currently we don't have any tests (which would be useful 😉), but before submitting your PR make sure that, after you have ingested some test data, it is queryable.
### Workflow:
Create a fork, make changes on your forked repository, and submit them in the form of a pull request.
## Questions / collaboration
Please join our [Discord](https://discord.gg/n5BX8dh8rU). Don't hesitate — we are very friendly and welcoming to new contributors.
# Thank you so much for considering contributing to DocsGPT!🙏

@ -57,7 +57,7 @@ Copy .env_sample and create .env with your openai api token
## [Guides](https://github.com/arc53/docsgpt/wiki)
## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md)
## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation)

@ -5,8 +5,8 @@ import datetime
from flask import Flask, request, render_template
# os.environ["LANGCHAIN_HANDLER"] = "langchain"
import faiss
from langchain import OpenAI
from langchain.chains import VectorDBQAWithSourcesChain
from langchain import OpenAI, VectorDBQA
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
import requests
@ -69,11 +69,22 @@ def api_answer():
c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template)
# create a chain with the prompt template and the store
chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt)
#chain = VectorDBQA.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt)
# chain = VectorDBQA.from_chain_type(llm=OpenAI(openai_api_key=api_key, temperature=0), chain_type='map_reduce',
# vectorstore=store)
qa_chain = load_qa_chain(OpenAI(openai_api_key=api_key, temperature=0), chain_type="map_reduce",
combine_prompt=c_prompt)
chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=store)
# fetch the answer
result = chain({"question": question})
result = chain({"query": question})
print(result)
# some formatting for the frontend
result['answer'] = result['result']
result['answer'] = result['answer'].replace("\\n", "<br>")
result['answer'] = result['answer'].replace("SOURCES:", "")
# mock result

@ -60,6 +60,7 @@ tiktoken==0.1.2
tokenizers==0.13.2
tqdm==4.64.1
transformers==4.26.0
typer==0.7.0
typing-inspect==0.8.0
typing_extensions==4.4.0
urllib3==1.26.14

@ -1,6 +1,9 @@
import sys
import nltk
import dotenv
import typer
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
@ -10,28 +13,52 @@ from parser.open_ai_func import call_openai_api, get_user_permission
dotenv.load_dotenv()
#Specify your folder HERE
directory_to_ingest = 'inputs'
app = typer.Typer(add_completion=False)
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
#Splits all files in specified folder to documents
raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data()
raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
text_splitter = RecursiveCharacterTextSplitter()
docs = text_splitter.split_documents(raw_docs)
# Here we check for command line arguments for bot calls.
# If no argument exists or the permission_bypass_flag argument is not '-y',
# user permission is requested to call the API.
if len(sys.argv) > 1:
permission_bypass_flag = sys.argv[1]
if permission_bypass_flag == '-y':
call_openai_api(docs)
@app.command()
def ingest(directory: Optional[str] = typer.Option("inputs",
help="Path to the directory for index creation."),
files: Optional[List[str]] = typer.Option(None,
help="""File paths to use (Optional; overrides directory).
E.g. --files inputs/1.md --files inputs/2.md"""),
recursive: Optional[bool] = typer.Option(True,
help="Whether to recursively search in subdirectories."),
limit: Optional[int] = typer.Option(None,
help="Maximum number of files to read."),
formats: Optional[List[str]] = typer.Option([".rst", ".md"],
help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""),
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")):
"""
Creates index from specified location or files.
By default /inputs folder is used, .rst and .md are parsed.
"""
raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive,
required_exts=formats, num_files_limit=limit,
exclude_hidden=exclude).load_data()
raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
print(raw_docs)
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
text_splitter = RecursiveCharacterTextSplitter()
docs = text_splitter.split_documents(raw_docs)
# Here we check for command line arguments for bot calls.
# If no argument exists or the permission_bypass_flag argument is not '-y',
# user permission is requested to call the API.
if len(sys.argv) > 1:
permission_bypass_flag = sys.argv[1]
if permission_bypass_flag == '-y':
call_openai_api(docs)
else:
get_user_permission(docs)
else:
get_user_permission(docs)
else:
get_user_permission(docs)
if __name__ == "__main__":
app()

@ -29,6 +29,18 @@ def convert_rst_to_txt(src_dir, dst_dir):
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.

@ -24,6 +24,8 @@ class RstParser(BaseParser):
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be careful with remove_characters_excess: it might cause data loss
remove_characters_excess: bool = True,
@ -34,6 +36,8 @@ class RstParser(BaseParser):
self._remove_hyperlinks = remove_hyperlinks
self._remove_images = remove_images
self._remove_table_excess = remove_table_excess
self._remove_interpreters = remove_interpreters
self._remove_directives = remove_directives
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
@ -95,6 +99,18 @@ class RstParser(BaseParser):
content = re.sub(pattern, r"\1", content)
return content
def remove_directives(self, content: str) -> str:
    """Remove reStructuredText directive markers such as ``.. note::``.

    Only the marker itself (``.. <directive-type>::``) is removed; the line's
    indentation, the directive arguments and the directive body are kept.

    The previous pattern began with a stray backtick, so ordinary directives
    were never matched.  The marker is now matched only at the start of a
    line (after optional indentation), so a ``..`` in the middle of running
    text is left untouched.

    Args:
        content: Raw reStructuredText, possibly spanning multiple lines.

    Returns:
        ``content`` with the directive markers stripped.
    """
    # Directive types consist of word characters plus "+", ".", ":" and "-"
    # (the colon allows domain directives such as ``py:function``).
    # Group 1 preserves the leading indentation of the line.
    pattern = r"^([ \t]*)\.\.[ \t]+[\w+.:-]+::"
    return re.sub(pattern, r"\1", content, flags=re.MULTILINE)
def remove_interpreters(self, content: str) -> str:
    """Remove reStructuredText interpreted-text role markers.

    A role marker is ``:name:`` (or a domain form such as ``:py:func:``)
    written directly before or directly after backquoted text, e.g.
    ``:ref:`label``` or ```text`:emphasis:``.  Only the marker is removed;
    the backquoted text stays.

    The previous pattern removed every ``:word:`` sequence regardless of
    context, which corrupted unrelated text (``12:30:45`` became ``1245``)
    and stripped only the first component of domain roles.

    Args:
        content: Raw reStructuredText.

    Returns:
        ``content`` with the role markers stripped.
    """
    role = r":[\w+.-]+(?::[\w+.-]+)*:"
    # Prefix form (marker followed by a backquote) or suffix form (marker
    # preceded by a backquote); the lookarounds keep the backquote itself.
    pattern = rf"{role}(?=`)|(?<=`){role}"
    return re.sub(pattern, "", content)
def remove_table_excess(self, content: str) -> str:
"""Pattern to remove grid table separators"""
pattern = r"^\+[-]+\+[-]+\+$"
@ -129,6 +145,10 @@ class RstParser(BaseParser):
content = self.remove_images(content)
if self._remove_table_excess:
content = self.remove_table_excess(content)
if self._remove_directives:
content = self.remove_directives(content)
if self._remove_interpreters:
content = self.remove_interpreters(content)
rst_tups = self.rst_to_tups(content)
if self._remove_whitespaces_excess:
rst_tups = self.remove_whitespaces_excess(rst_tups)

@ -14,10 +14,38 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
def call_openai_api(docs):
    """Embed ``docs`` into a FAISS vector store and save it to disk.

    The store is seeded with the first document and the remaining documents
    are added one at a time.  If adding a document fails, the progress made
    so far is saved, the function sleeps for 60 seconds and retries that
    document once; a second failure propagates to the caller.

    On completion the index is written to ``docs.index`` and the pickled
    store to ``faiss_store.pkl`` in the current working directory.

    Args:
        docs: List of documents exposing ``page_content`` and ``metadata``.
            The list is not modified.  If it is empty, nothing is embedded
            or written.
    """
    import time

    from tqdm import tqdm

    if not docs:
        print("No documents to embed. The API was not called.")
        return

    # Slice instead of pop(0) so the caller's list is left intact.
    first_doc, remaining_docs = docs[0], docs[1:]
    store = FAISS.from_documents([first_doc], OpenAIEmbeddings())

    progress = tqdm(remaining_docs, desc="Embedding 🦖", unit="docs", total=len(remaining_docs),
                    bar_format='{l_bar}{bar}| Time Left: {remaining}')
    for done, doc in enumerate(progress):
        try:
            store.add_texts([doc.page_content], metadatas=[doc.metadata])
        except Exception as e:
            # NOTE(review): the concrete exception type raised by the
            # embedding call is not visible here (presumably a rate limit),
            # so the broad catch of the original is kept.
            print(e)
            print("Error on ", doc)
            print("Saving progress")
            print(f"stopped at {done} out of {len(remaining_docs)}")
            _save_store(store)
            print("Sleeping for 60 seconds and trying again")
            time.sleep(60)
            # Single retry; if this fails as well the exception propagates.
            store.add_texts([doc.page_content], metadatas=[doc.metadata])

    _save_store(store)


def _save_store(store):
    """Write the store's FAISS index to docs.index and pickle the store."""
    index = store.index
    faiss.write_index(index, "docs.index")
    # The index is persisted separately above, so it is detached while the
    # store is pickled and re-attached afterwards to keep the store usable.
    store.index = None
    try:
        with open("faiss_store.pkl", "wb") as f:
            pickle.dump(store, f)
    finally:
        store.index = index
@ -41,4 +69,4 @@ def get_user_permission(docs):
elif user_input == "":
call_openai_api(docs)
else:
print("The API was not called. No money was spent.")
print("The API was not called. No money was spent.")

Loading…
Cancel
Save