Proper PEP8 formatting

pull/232/head
Anton Larin 1 year ago
parent 7f56f57778
commit 168648e789

@@ -90,10 +90,12 @@ mongo = MongoClient(app.config['MONGO_URI'])
db = mongo["docsgpt"]
vectors_collection = db["vectors"]
async def async_generate(chain, question, chat_history):
result = await chain.arun({"question": question, "chat_history": chat_history})
return result
def run_async_chain(chain, question, chat_history):
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
@@ -105,6 +107,7 @@ def run_async_chain(chain, question, chat_history):
result["answer"] = answer
return result
@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
resp = ingest_worker(self, directory, formats, name_job, filename, user)

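The hunks above show how the API calls a LangChain conversational chain from synchronous Flask/Celery code: the coroutine is awaited on a fresh event loop per request. A minimal, self-contained sketch of that pattern follows; the elided middle of run_async_chain (the try/finally and the result dict) is filled in as an assumption, not taken verbatim from the file.

import asyncio


async def async_generate(chain, question, chat_history):
    # Await the chain's async entry point and return its answer.
    return await chain.arun({"question": question, "chat_history": chat_history})


def run_async_chain(chain, question, chat_history):
    # A fresh event loop per call, so this also works from worker threads
    # that do not own a running loop.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    result = {}
    try:
        answer = loop.run_until_complete(async_generate(chain, question, chat_history))
    finally:
        loop.close()
    result["answer"] = answer
    return result
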
@@ -11,10 +11,10 @@ import tiktoken
import sys
from argparse import ArgumentParser
import ast
import json
dotenv.load_dotenv()
ps = list(Path("inputs").glob("**/*.py"))
data = []
sources = []
@@ -24,7 +24,6 @@ for p in ps:
sources.append(p)
# with open('inputs/client.py', 'r') as f:
# tree = ast.parse(f.read())
@@ -64,11 +63,9 @@ for code in data:
c1 += 1
# save the structure dict as json
import json
with open('structure_dict.json', 'w') as f:
json.dump(structure_dict, f)
# llm = OpenAI(temperature=0)
# prompt = PromptTemplate(
# input_variables=["code"],
@@ -119,8 +116,3 @@ for source, classes in structure_dict.items():
else:
with open(f"outputs/{source_w}", "a") as f:
f.write(f"\n\nFunction: {functions[function]}, \nDocumentation: {response}")

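For context, the structure_dict dumped above is built by parsing each collected .py file with ast before being written to structure_dict.json. A minimal sketch of that step, assuming a simple source-file-to-function-names layout (the exact dict shape in the script may differ):

import ast
import json
from pathlib import Path

structure_dict = {}
for p in Path("inputs").glob("**/*.py"):
    tree = ast.parse(p.read_text())
    # Collect the names of all function definitions found in the module.
    structure_dict[str(p)] = [node.name for node in ast.walk(tree)
                              if isinstance(node, ast.FunctionDef)]

# Save the structure dict as JSON, as in the script above.
with open("structure_dict.json", "w") as f:
    json.dump(structure_dict, f)
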
@@ -16,7 +16,6 @@ from parser.js2doc import extract_functions_and_classes as extract_js
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.token_func import group_split
dotenv.load_dotenv()
app = typer.Typer(add_completion=False)
@@ -41,12 +40,12 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
sample: Optional[bool] = typer.Option(False, help="Whether to output sample of the first 5 split documents."),
sample: Optional[bool] = typer.Option(False,
help="Whether to output sample of the first 5 split documents."),
token_check: Optional[bool] = typer.Option(True, help="Whether to group small documents and split large."),
min_tokens: Optional[int] = typer.Option(150, help="Minimum number of tokens to not group."),
max_tokens: Optional[int] = typer.Option(2000, help="Maximum number of tokens to not split."),
):
"""
Creates index from specified location or files.
By default /inputs folder is used, .rst and .md are parsed.
@@ -59,7 +58,8 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
# Here we split the documents, as needed, into smaller chunks.
# We do this due to the context limits of the LLMs.
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens, token_check=token_check)
raw_docs = group_split(documents=raw_docs, min_tokens=min_tokens, max_tokens=max_tokens,
token_check=token_check)
# Old method
# text_splitter = RecursiveCharacterTextSplitter()
# docs = text_splitter.split_documents(raw_docs)
@@ -71,7 +71,6 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
# Here we check for command line arguments for bot calls.
# If no argument exists or the yes is not True, then the
# user permission is requested to call the API.
@@ -103,7 +102,6 @@ def convert(dir: Optional[str] = typer.Option("inputs",
formats: Optional[str] = typer.Option("py",
help="""Required language.
py, js, java supported for now""")):
"""
Creates documentation linked to original functions from specified location.
By default /inputs folder is used, .py is parsed.
@@ -117,7 +115,7 @@ def convert(dir: Optional[str] = typer.Option("inputs",
else:
raise Exception("Sorry, language not supported yet")
transform_to_docs(functions_dict, classes_dict, formats, dir)
if __name__ == "__main__":
app()
if __name__ == "__main__":
app()

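The ingest/convert changes above only re-wrap long typer.Option declarations and the group_split call to stay under the line-length limit; behaviour is unchanged. A stand-alone sketch of the wrapped option style (the command and options here are illustrative, not the project's full CLI):

from typing import Optional

import typer

app = typer.Typer(add_completion=False)


@app.command()
def ingest(sample: Optional[bool] = typer.Option(False,
                                                 help="Whether to output sample of the first 5 split documents."),
           max_tokens: Optional[int] = typer.Option(2000, help="Maximum number of tokens to not split.")):
    """Creates index from specified location or files."""
    typer.echo(f"sample={sample}, max_tokens={max_tokens}")


if __name__ == "__main__":
    app()
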
@@ -9,6 +9,7 @@ from typing import Dict, Union
from parser.file.base_parser import BaseParser
class HTMLParser(BaseParser):
"""HTML parser."""

@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
self._max_tokens = max_tokens
# self._remove_tables = remove_tables
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:
@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
else:
tups.append((current_header, current_text))
return tups
def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.

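tups_chunk_append above is part of token-aware chunking: a (header, text) tuple is only appended whole when it fits the token budget, otherwise it is split first. A small sketch of the token check it relies on, assuming the cl100k_base encoding used throughout these parsers (the budget value is illustrative):

import tiktoken

MAX_TOKENS = 2048  # illustrative budget


def fits_token_budget(text: str, max_tokens: int = MAX_TOKENS) -> bool:
    # Count tokens with the same encoding the parsers use.
    num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(text))
    return num_tokens <= max_tokens


print(fits_token_budget("A short markdown section."))  # True
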
@@ -10,6 +10,7 @@ from typing import Any, Dict, List, Optional, Tuple, Union, cast
from parser.file.base_parser import BaseParser
import tiktoken
class RstParser(BaseParser):
"""reStructuredText parser.
@@ -41,7 +42,6 @@ class RstParser(BaseParser):
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess
def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.
@@ -56,7 +56,8 @@ class RstParser(BaseParser):
for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue

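The re-wrapped condition above looks for reStructuredText section headers: an underline made of '-' or '=' whose length matches the title line just above it. A stand-alone check of that regex on a toy input (illustrative only):

import re

lines = ["Installation", "============", "Body text."]
for i, line in enumerate(lines):
    header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
    if header_match and i > 0 and len(lines[i - 1].strip()) == len(header_match.group().strip()):
        print(f"Header found: {lines[i - 1]!r}")  # -> Header found: 'Installation'
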
@@ -1,6 +1,7 @@
import os
import javalang
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
@@ -9,6 +10,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, "r") as file:
java_code = file.read()
@@ -28,6 +30,7 @@ def extract_functions(file_path):
methods[method_name] = method_source_code
return methods
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -47,6 +50,7 @@ def extract_classes(file_path):
classes[class_name] = class_string
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

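java2doc follows the same shape as the other language extractors: walk a directory, collect the matching source files, then parse each one into per-name source snippets. A minimal sketch of the file-collection step, with the '.java' filter added as an assumption (the hunks above do not show it):

import os


def find_files(directory, extension=".java"):
    files_list = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(extension):
                files_list.append(os.path.join(root, file))
    return files_list


print(find_files("inputs"))
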
@@ -11,6 +11,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -38,6 +39,7 @@ def extract_functions(file_path):
functions[func_name] = escodegen.generate(declaration.init)
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -53,6 +55,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

@@ -12,7 +12,6 @@ from langchain.embeddings import OpenAIEmbeddings
from retry import retry
def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
@@ -20,11 +19,13 @@ def num_tokens_from_string(string: str, encoding_name: str) -> int:
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price
@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
store.add_texts([i.page_content], metadatas=[i.metadata])
# store_pine.add_texts([i.page_content], metadatas=[i.metadata])
def call_openai_api(docs, folder_name):
# Function to create a vector store from the documents and save it to disk.
@@ -51,7 +52,8 @@ def call_openai_api(docs, folder_name):
# model_name = "sentence-transformers/all-mpnet-base-v2"
# hf = HuggingFaceEmbeddings(model_name=model_name)
# store = FAISS.from_documents(docs_test, hf)
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
bar_format='{l_bar}{bar}| Time Left: {remaining}'):
try:
store_add_texts_with_retry(store, i)
except Exception as e:
@@ -64,6 +66,7 @@ def call_openai_api(docs, folder_name):
c1 += 1
store.save_local(f"outputs/{folder_name}")
def get_user_permission(docs, folder_name):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
@@ -72,7 +75,6 @@ def get_user_permission(docs, folder_name):
for doc in docs:
docs_content += doc.page_content
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")

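num_tokens_from_string above drives the cost prompt in get_user_permission: the concatenated documents are tokenised with tiktoken and priced at $0.0004 per 1,000 tokens before the user is asked to proceed. A self-contained sketch of that calculation (the price constant is copied from the hunk above and may be outdated):

import tiktoken


def num_tokens_from_string(string: str, encoding_name: str = "cl100k_base"):
    # Convert the string to tokens and estimate the OpenAI cost.
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    total_price = (num_tokens / 1000) * 0.0004
    return num_tokens, total_price


tokens, total_price = num_tokens_from_string("Example document contents.")
print(f"Number of Tokens = {format(tokens, ',d')} (approx. ${total_price:.4f})")
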
@@ -5,6 +5,7 @@ from pathlib import Path
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):
@@ -13,6 +14,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list
def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -25,6 +27,7 @@ def extract_functions(file_path):
functions[func_name] = func_def
return functions
def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()
@@ -40,6 +43,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes
def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
@@ -53,6 +57,7 @@ def extract_functions_and_classes(directory):
classes_dict[file] = classes
return functions_dict, classes_dict
def parse_functions(functions_dict, formats, dir):
c1 = len(functions_dict)
for i, (source, functions) in enumerate(functions_dict.items(), start=1):
@@ -70,7 +75,8 @@ def parse_functions(functions_dict, formats, dir):
response = llm(prompt.format(code=function))
mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
with open(f"outputs/{source_w}", mode) as f:
f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
f.write(
f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
def parse_classes(classes_dict, formats, dir):
@@ -92,6 +98,7 @@ def parse_classes(classes_dict, formats, dir):
with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")
def transform_to_docs(functions_dict, classes_dict, formats, dir):
docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])

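parse_functions and parse_classes above only gain wrapped f.write calls; the underlying flow is unchanged: build a PromptTemplate over the extracted source, call the LLM once per function or class, and append Markdown documentation under outputs/. A minimal sketch of one such call using the legacy langchain interfaces this file imports (the prompt text and output formatting are simplified, illustrative assumptions):

from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

llm = OpenAI(temperature=0)
prompt = PromptTemplate(input_variables=["code"],
                        template="Code: \n{code}, \nDocumentation: ")


def document_function(name, function_source):
    # One LLM call per extracted function, mirroring parse_functions above.
    response = llm(prompt.format(code=function_source))
    return f"\n\n# Function name: {name} \n\nFunction: \n{function_source}, \nDocumentation: \n{response}"
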
@@ -13,6 +13,7 @@ def separate_header_and_body(text):
body = text[len(header):]
return header, body
def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
docs = []
current_group = None
@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
if current_group is None:
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)
@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
return docs
def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
docs = []
for doc in documents:
@@ -54,6 +57,7 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
docs.append(new_doc)
return docs
def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
return documents

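group_split above is the entry point for token-aware chunking: it first merges documents into larger groups while they stay under max_tokens (group_documents), then breaks oversized documents apart (split_documents), and is bypassed entirely when token_check is False. A compact sketch of the grouping rule on plain strings, assuming the cl100k_base encoding (the Document wrapper and metadata handling are omitted):

import tiktoken


def group_texts(texts, min_tokens=150, max_tokens=2000):
    enc = tiktoken.get_encoding("cl100k_base")
    groups, current = [], None
    for text in texts:
        doc_len = len(enc.encode(text))
        if current is None:
            current = text
        elif len(enc.encode(current)) + doc_len < max_tokens and doc_len >= min_tokens:
            # Same merge condition as group_documents above.
            current = current + " " + text
        else:
            groups.append(current)
            current = text
    if current is not None:
        groups.append(current)
    return groups


print(len(group_texts(["first chunk of text " * 40, "second chunk of text " * 40])))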