diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..92cc718 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,44 @@ +name: Build and push DocsGPT Docker image + +on: + workflow_dispatch: + push: + branches: + - main + +jobs: + deploy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v1 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v1 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Login to ghcr.io + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.repository_owner }} + password: ${{ secrets.GHCR_TOKEN }} + + # Runs a single command using the runners shell + - name: Build and push Docker images to docker.io and ghcr.io + uses: docker/build-push-action@v2 + with: + file: './application/Dockerfile' + platforms: linux/amd64 + context: ./application + push: true + tags: | + ${{ secrets.DOCKER_USERNAME }}/docsgpt:latest + ghcr.io/${{ github.repository_owner }}/docsgpt:latest diff --git a/.gitignore b/.gitignore index 8b394e9..0003c21 100644 --- a/.gitignore +++ b/.gitignore @@ -161,3 +161,4 @@ frontend/*.sw? application/vectors/ +**/inputs diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..0c10d0b --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,38 @@ +# Welcome to DocsGPT Contributing guideline + +Thank you for choosing this project to contribute to, we are all very grateful! 
+ +# We accept different types of contributions + +๐Ÿ“ฃ Discussions - where you can start a new topic or answer some questions + +๐Ÿž Issues - Is how we track tasks, sometimes it's bugs that need fixing, sometimes it's new features + +๐Ÿ› ๏ธ Pull requests - Is how you can suggest changes to our repository, to work on an existing issue or to add new features + +๐Ÿ“š Wiki - where we have our documentation + + +## ๐Ÿž Issues and Pull requests + +We value contributions to our issues in the form of discussion or suggestion, and we recommend that you check out existing issues and our [Roadmap](https://github.com/orgs/arc53/projects/2) + +If you want to contribute by writing code there are a few things that you should know before doing it: +We have a frontend (React, Vite) and a backend (Python) + +### If you are looking to contribute to Frontend (โš›๏ธReact, Vite): +The current frontend is being migrated from /application to /frontend with a new design, so please contribute to the new one. Check out this [Milestone](https://github.com/arc53/DocsGPT/milestone/1) and its issues, as well as the [Figma](https://www.figma.com/file/OXLtrl1EAy885to6S69554/DocsGPT?node-id=0%3A1&t=hjWVuxRg9yi5YkJ9-1) +Please try to follow the guidelines + + +### If you are looking to contribute to Backend (๐ŸPython): +Check out our issues, and contribute to /application or /scripts (ignore the old ingest_rst.py and ingest_rst_sphinx.py files, they will be deprecated soon) +Currently we don't have any tests (which would be useful๐Ÿ˜‰), but before submitting your PR make sure that after you ingest some test data it's queryable + +### Workflow: +Create a fork, make changes on your forked repository, and submit your changes in the form of a pull request + +## Questions / collaboration +Please join our [Discord](https://discord.gg/n5BX8dh8rU) — don't hesitate, we are very friendly and welcoming to new contributors. 
+ +# Thank you so much for considering to contribute to DocsGPT!๐Ÿ™ diff --git a/README.md b/README.md index 9b81171..ccd1430 100644 --- a/README.md +++ b/README.md @@ -57,7 +57,7 @@ Copy .env_sample and create .env with your openai api token ## [Guides](https://github.com/arc53/docsgpt/wiki) - +## [Interested in contributing?](https://github.com/arc53/DocsGPT/blob/main/CONTRIBUTING.md) ## [How to use any other documentation](https://github.com/arc53/docsgpt/wiki/How-to-train-on-other-documentation) diff --git a/application/app.py b/application/app.py index aa9089e..c114c63 100644 --- a/application/app.py +++ b/application/app.py @@ -5,8 +5,8 @@ import datetime from flask import Flask, request, render_template # os.environ["LANGCHAIN_HANDLER"] = "langchain" import faiss -from langchain import OpenAI -from langchain.chains import VectorDBQAWithSourcesChain +from langchain import OpenAI, VectorDBQA +from langchain.chains.question_answering import load_qa_chain from langchain.prompts import PromptTemplate import requests @@ -69,11 +69,22 @@ def api_answer(): c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template) # create a chain with the prompt template and the store - chain = VectorDBQAWithSourcesChain.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt) + #chain = VectorDBQA.from_llm(llm=OpenAI(openai_api_key=api_key, temperature=0), vectorstore=store, combine_prompt=c_prompt) + # chain = VectorDBQA.from_chain_type(llm=OpenAI(openai_api_key=api_key, temperature=0), chain_type='map_reduce', + # vectorstore=store) + + qa_chain = load_qa_chain(OpenAI(openai_api_key=api_key, temperature=0), chain_type="map_reduce", + combine_prompt=c_prompt) + chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=store) + + + # fetch the answer - result = chain({"question": question}) + result = chain({"query": question}) + print(result) # some formatting for the frontend + result['answer'] = 
result['result'] result['answer'] = result['answer'].replace("\\n", "
") result['answer'] = result['answer'].replace("SOURCES:", "") # mock result diff --git a/application/requirements.txt b/application/requirements.txt index 9e8f73b..7972f8c 100644 --- a/application/requirements.txt +++ b/application/requirements.txt @@ -60,6 +60,7 @@ tiktoken==0.1.2 tokenizers==0.13.2 tqdm==4.64.1 transformers==4.26.0 +typer==0.7.0 typing-inspect==0.8.0 typing_extensions==4.4.0 urllib3==1.26.14 diff --git a/scripts/ingest.py b/scripts/ingest.py index cebb6c3..3082cf4 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -1,6 +1,9 @@ import sys import nltk import dotenv +import typer + +from typing import List, Optional from langchain.text_splitter import RecursiveCharacterTextSplitter @@ -10,28 +13,52 @@ from parser.open_ai_func import call_openai_api, get_user_permission dotenv.load_dotenv() -#Specify your folder HERE -directory_to_ingest = 'inputs' +app = typer.Typer(add_completion=False) -nltk.download('punkt') -nltk.download('averaged_perceptron_tagger') +nltk.download('punkt', quiet=True) +nltk.download('averaged_perceptron_tagger', quiet=True) #Splits all files in specified folder to documents -raw_docs = SimpleDirectoryReader(input_dir=directory_to_ingest).load_data() -raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] -# Here we split the documents, as needed, into smaller chunks. -# We do this due to the context limits of the LLMs. -text_splitter = RecursiveCharacterTextSplitter() -docs = text_splitter.split_documents(raw_docs) - -# Here we check for command line arguments for bot calls. -# If no argument exists or the permission_bypass_flag argument is not '-y', -# user permission is requested to call the API. 
-if len(sys.argv) > 1: - permission_bypass_flag = sys.argv[1] - if permission_bypass_flag == '-y': - call_openai_api(docs) +@app.command() +def ingest(directory: Optional[str] = typer.Option("inputs", + help="Path to the directory for index creation."), + files: Optional[List[str]] = typer.Option(None, + help="""File paths to use (Optional; overrides directory). + E.g. --files inputs/1.md --files inputs/2.md"""), + recursive: Optional[bool] = typer.Option(True, + help="Whether to recursively search in subdirectories."), + limit: Optional[int] = typer.Option(None, + help="Maximum number of files to read."), + formats: Optional[List[str]] = typer.Option([".rst", ".md"], + help="""List of required extensions (list with .) + Currently supported: .rst, .md, .pdf, .docx, .csv, .epub"""), + exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles).")): + + """ + Creates index from specified location or files. + By default /inputs folder is used, .rst and .md are parsed. + """ + raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=files, recursive=recursive, + required_exts=formats, num_files_limit=limit, + exclude_hidden=exclude).load_data() + raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs] + print(raw_docs) + # Here we split the documents, as needed, into smaller chunks. + # We do this due to the context limits of the LLMs. + text_splitter = RecursiveCharacterTextSplitter() + docs = text_splitter.split_documents(raw_docs) + + # Here we check for command line arguments for bot calls. + # If no argument exists or the permission_bypass_flag argument is not '-y', + # user permission is requested to call the API. 
+ if len(sys.argv) > 1: + permission_bypass_flag = sys.argv[1] + if permission_bypass_flag == '-y': + call_openai_api(docs) + else: + get_user_permission(docs) else: get_user_permission(docs) -else: - get_user_permission(docs) \ No newline at end of file + +if __name__ == "__main__": + app() diff --git a/scripts/ingest_rst.py b/scripts/old/ingest_rst.py similarity index 100% rename from scripts/ingest_rst.py rename to scripts/old/ingest_rst.py diff --git a/scripts/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py similarity index 88% rename from scripts/ingest_rst_sphinx.py rename to scripts/old/ingest_rst_sphinx.py index 9d6c8ec..ecc7157 100644 --- a/scripts/ingest_rst_sphinx.py +++ b/scripts/old/ingest_rst_sphinx.py @@ -29,6 +29,18 @@ def convert_rst_to_txt(src_dir, dst_dir): f"-D source_suffix=.rst " \ f"-C {dst_dir} " sphinx_main(args.split()) + elif file.endswith(".md"): + # Rename the .md file to .rst file + src_file = os.path.join(root, file) + dst_file = os.path.join(root, file.replace(".md", ".rst")) + os.rename(src_file, dst_file) + # Convert the .rst file to .txt file using sphinx-build + args = f". -b text -D extensions=sphinx.ext.autodoc " \ + f"-D master_doc={dst_file} " \ + f"-D source_suffix=.rst " \ + f"-C {dst_dir} " + sphinx_main(args.split()) + def num_tokens_from_string(string: str, encoding_name: str) -> int: # Function to convert string to tokens and estimate user cost. 
diff --git a/scripts/parser/file/rst_parser.py b/scripts/parser/file/rst_parser.py index 0b887d4..7c97b32 100644 --- a/scripts/parser/file/rst_parser.py +++ b/scripts/parser/file/rst_parser.py @@ -24,6 +24,8 @@ class RstParser(BaseParser): remove_hyperlinks: bool = True, remove_images: bool = True, remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, remove_whitespaces_excess: bool = True, #Be carefull with remove_characters_excess, might cause data loss remove_characters_excess: bool = True, @@ -34,6 +36,8 @@ class RstParser(BaseParser): self._remove_hyperlinks = remove_hyperlinks self._remove_images = remove_images self._remove_table_excess = remove_table_excess + self._remove_interpreters = remove_interpreters + self._remove_directives = remove_directives self._remove_whitespaces_excess = remove_whitespaces_excess self._remove_characters_excess = remove_characters_excess @@ -95,6 +99,18 @@ class RstParser(BaseParser): content = re.sub(pattern, r"\1", content) return content + def remove_directives(self, content: str) -> str: + """Removes reStructuredText Directives""" + pattern = r"`\.\.([^:]+)::" + content = re.sub(pattern, "", content) + return content + + def remove_interpreters(self, content: str) -> str: + """Removes reStructuredText Interpreted Text Roles""" + pattern = r":(\w+):" + content = re.sub(pattern, "", content) + return content + def remove_table_excess(self, content: str) -> str: """Pattern to remove grid table separators""" pattern = r"^\+[-]+\+[-]+\+$" @@ -129,6 +145,10 @@ class RstParser(BaseParser): content = self.remove_images(content) if self._remove_table_excess: content = self.remove_table_excess(content) + if self._remove_directives: + content = self.remove_directives(content) + if self._remove_interpreters: + content = self.remove_interpreters(content) rst_tups = self.rst_to_tups(content) if self._remove_whitespaces_excess: rst_tups = self.remove_whitespaces_excess(rst_tups) diff --git 
a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index 500e488..7009132 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -14,10 +14,38 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int: def call_openai_api(docs): # Function to create a vector store from the documents and save it to disk. - store = FAISS.from_documents(docs, OpenAIEmbeddings()) + from tqdm import tqdm + docs_test = [docs[0]] + # remove the first element from docs + docs.pop(0) + # cut first n docs if you want to restart + #docs = docs[:n] + c1 = 0 + store = FAISS.from_documents(docs_test, OpenAIEmbeddings()) + for i in tqdm(docs, desc="Embedding ๐Ÿฆ–", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'): + try: + import time + store.add_texts([i.page_content], metadatas=[i.metadata]) + except Exception as e: + print(e) + print("Error on ", i) + print("Saving progress") + print(f"stopped at {c1} out of {len(docs)}") + faiss.write_index(store.index, "docs.index") + store_index_bak = store.index + store.index = None + with open("faiss_store.pkl", "wb") as f: + pickle.dump(store, f) + print("Sleeping for 60 seconds and trying again") + time.sleep(60) + faiss.write_index(store_index_bak, "docs.index") + store.index = store_index_bak + store.add_texts([i.page_content], metadatas=[i.metadata]) + c1 += 1 + + faiss.write_index(store.index, "docs.index") store.index = None - with open("faiss_store.pkl", "wb") as f: pickle.dump(store, f) @@ -41,4 +69,4 @@ def get_user_permission(docs): elif user_input == "": call_openai_api(docs) else: - print("The API was not called. No money was spent.") \ No newline at end of file + print("The API was not called. No money was spent.")