* Validate Python formatting on every build with Ruff
* fix lint warnings
pull/232/head
Anton Larin 1 year ago committed by GitHub
parent 168648e789
commit 962becb9a5

@@ -23,7 +23,7 @@ jobs:
       with:
         username: ${{ secrets.DOCKER_USERNAME }}
         password: ${{ secrets.DOCKER_PASSWORD }}
     - name: Login to ghcr.io
       uses: docker/login-action@v2
       with:

@@ -0,0 +1,17 @@
+name: Python linting
+on:
+  push:
+    branches:
+      - '*'
+  pull_request:
+    types: [ opened, synchronize ]
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Lint with Ruff
+        uses: chartboost/ruff-action@v1
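The workflow above runs Ruff through the chartboost/ruff-action on every push and pull request. A minimal sketch of reproducing the same check locally before pushing (an assumption, not part of the repository: it presumes Ruff is installed, e.g. via `pip install ruff`, and the exact CLI subcommand can vary between Ruff versions):

# Hypothetical local pre-push check mirroring the CI lint step.
import subprocess
import sys

result = subprocess.run(["ruff", "check", "."])  # non-zero exit code when lint violations are found
sys.exit(result.returncode)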

@@ -0,0 +1,2 @@
+# Allow lines to be as long as 120 characters.
+line-length = 120

@@ -1,8 +1,9 @@
+import asyncio
 import datetime
+import http.client
 import json
 import os
 import traceback
-import asyncio
 import dotenv
 import requests

@@ -26,10 +27,9 @@ from langchain.prompts.chat import (
 from pymongo import MongoClient
 from werkzeug.utils import secure_filename
+from core.settings import settings
 from error import bad_request
 from worker import ingest_worker
-from core.settings import settings
-import celeryconfig
 # os.environ["LANGCHAIN_HANDLER"] = "langchain"

@@ -177,18 +177,12 @@ def api_answer():
         q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
                                   template_format="jinja2")
     if settings.LLM_NAME == "openai_chat":
-        # llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
-        llm = ChatOpenAI(openai_api_key=api_key)
+        llm = ChatOpenAI(openai_api_key=api_key)  # optional parameter: model_name="gpt-4"
         messages_combine = [
             SystemMessagePromptTemplate.from_template(chat_combine_template),
             HumanMessagePromptTemplate.from_template("{question}")
         ]
         p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
-        messages_reduce = [
-            SystemMessagePromptTemplate.from_template(chat_reduce_template),
-            HumanMessagePromptTemplate.from_template("{question}")
-        ]
-        p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
     elif settings.LLM_NAME == "openai":
         llm = OpenAI(openai_api_key=api_key, temperature=0)
     elif settings.LLM_NAME == "manifest":

@@ -226,7 +220,7 @@ def api_answer():
     result['answer'] = result['answer'].replace("\\n", "\n")
     try:
         result['answer'] = result['answer'].split("SOURCES:")[0]
-    except:
+    except Exception:
         pass
     # mock result
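The `except:` to `except Exception:` change in the hunk above is the standard fix for Ruff's bare-except rule (E722). A tiny illustrative sketch, not taken from the codebase, of why the narrower clause is preferred:

# A bare `except:` also swallows KeyboardInterrupt and SystemExit, so Ctrl-C and
# normal interpreter shutdown can be silently ignored; `except Exception:` still
# catches ordinary errors but lets those control-flow exceptions propagate.
answer = "42 SOURCES: docs.md"
try:
    answer = answer.split("SOURCES:")[0]
except Exception:  # preferred over a bare `except:`
    pass
print(answer)  # "42 "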
@@ -295,7 +289,7 @@ def api_feedback():
             "feedback": feedback
         })
     )
-    return {"status": 'ok'}
+    return {"status": http.client.responses.get(response.status_code, 'ok')}

 @app.route('/api/combine', methods=['GET'])
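`http.client.responses` is the standard-library mapping from integer status codes to their reason phrases, so the feedback endpoint now reports the upstream status instead of a hard-coded 'ok'. A small standalone sketch of the lookup, with the same `.get(..., 'ok')` fallback used above:

import http.client

# Maps e.g. 200 -> "OK", 404 -> "Not Found"; unknown codes fall back to 'ok'.
print(http.client.responses.get(200, 'ok'))  # OK
print(http.client.responses.get(404, 'ok'))  # Not Found
print(http.client.responses.get(599, 'ok'))  # ok (599 has no standard reason phrase)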

@@ -1,7 +1,8 @@
 import os
 broker_url = os.getenv("CELERY_BROKER_URL")
 result_backend = os.getenv("CELERY_RESULT_BACKEND")
 task_serializer = 'json'
 result_serializer = 'json'
 accept_content = ['json']

@@ -1,6 +1,7 @@
-from pydantic import BaseSettings
 from pathlib import Path
+from pydantic import BaseSettings

 class Settings(BaseSettings):
     LLM_NAME: str = "openai_chat"

@@ -1,13 +1,15 @@
 from flask import jsonify
 from werkzeug.http import HTTP_STATUS_CODES

-def response_error(code_status,message=None):
-    payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")}
+def response_error(code_status, message=None):
+    payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
     if message:
         payload['message'] = message
     response = jsonify(payload)
     response.status_code = code_status
     return response

-def bad_request(status_code=400,message=''):
-    return response_error(code_status=status_code,message=message)
+def bad_request(status_code=400, message=''):
+    return response_error(code_status=status_code, message=message)

@@ -3,7 +3,6 @@ from abc import abstractmethod
 from typing import Any, List
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.base import Document

@@ -9,6 +9,7 @@ from typing import Dict, Union
 from parser.file.base_parser import BaseParser

 class HTMLParser(BaseParser):
     """HTML parser."""

@@ -23,38 +24,37 @@ class HTMLParser(BaseParser):
             Union[str, List[str]]: a string or a List of strings.
         """
         try:
-            import unstructured
+            from unstructured.partition.html import partition_html
+            from unstructured.staging.base import convert_to_isd
+            from unstructured.cleaners.core import clean
         except ImportError:
             raise ValueError("unstructured package is required to parse HTML files.")
-        from unstructured.partition.html import partition_html
-        from unstructured.staging.base import convert_to_isd
-        from unstructured.cleaners.core import clean
         # Using the unstructured library to convert the html to isd format
         # isd sample : isd = [
         # {"text": "My Title", "type": "Title"},
         # {"text": "My Narrative", "type": "NarrativeText"}
         # ]
         with open(file, "r", encoding="utf-8") as fp:
             elements = partition_html(file=fp)
             isd = convert_to_isd(elements)
         # Removing non ascii charactwers from isd_el['text']
         for isd_el in isd:
             isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
         # Removing all the \n characters from isd_el['text'] using regex and replace with single space
         # Removing all the extra spaces from isd_el['text'] using regex and replace with single space
         for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
+            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
         # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
         for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
+            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
         # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
+        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
         # Creating 'Chunks' - List of lists of strings
         # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'

@@ -64,19 +64,20 @@ class HTMLParser(BaseParser):
         Chunks = [[]]
         final_chunks = list(list())
-        for i,isd_el in enumerate(isd):
+        for i, isd_el in enumerate(isd):
             if i in title_indexes:
                 Chunks.append([])
             Chunks[-1].append(isd_el['text'])
-        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
+        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25
+        # TODO: This value can be an user defined variable
         for chunk in Chunks:
             # sum of lenth of all the strings in the chunk
             sum = 0
             sum += len(str(chunk))
             if sum < 25:
                 Chunks.remove(chunk)
-            else :
+            else:
                 # appending all the approved chunks to final_chunks as a single string
                 final_chunks.append(" ".join([str(item) for item in chunk]))
         return final_chunks

@@ -7,8 +7,8 @@ import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from parser.file.base_parser import BaseParser
 import tiktoken
+from parser.file.base_parser import BaseParser

 class MarkdownParser(BaseParser):

@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
     """

     def __init__(
             self,
             *args: Any,
             remove_hyperlinks: bool = True,
             remove_images: bool = True,
             max_tokens: int = 2048,
             # remove_tables: bool = True,
             **kwargs: Any,
     ) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)

@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
         self._max_tokens = max_tokens
         # self._remove_tables = remove_tables

-    def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
+    def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
+                          current_text: str):
         """Append to tups chunk."""
         num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
         if num_tokens > self._max_tokens:

@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
         else:
             tups.append((current_header, current_text))
         return tups

     def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
         """Convert a markdown file to a dictionary.

@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
         return {}

     def parse_tups(
             self, filepath: Path, errors: str = "ignore"
     ) -> List[Tuple[Optional[str], str]]:
         """Parse file into tuples."""
         with open(filepath, "r") as f:

@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
         return markdown_tups

     def parse_file(
             self, filepath: Path, errors: str = "ignore"
     ) -> Union[str, List[str]]:
         """Parse file into string."""
         tups = self.parse_tups(filepath, errors=errors)

@@ -5,10 +5,10 @@ Contains parser for md files.
 """
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union
 from parser.file.base_parser import BaseParser
-import tiktoken

 class RstParser(BaseParser):
     """reStructuredText parser.

@@ -19,17 +19,17 @@ class RstParser(BaseParser):
     """

     def __init__(
             self,
             *args: Any,
             remove_hyperlinks: bool = True,
             remove_images: bool = True,
             remove_table_excess: bool = True,
             remove_interpreters: bool = True,
             remove_directives: bool = True,
             remove_whitespaces_excess: bool = True,
-            #Be carefull with remove_characters_excess, might cause data loss
+            # Be carefull with remove_characters_excess, might cause data loss
             remove_characters_excess: bool = True,
             **kwargs: Any,
     ) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)
@@ -41,7 +41,6 @@ class RstParser(BaseParser):
         self._remove_whitespaces_excess = remove_whitespaces_excess
         self._remove_characters_excess = remove_characters_excess

     def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
         """Convert a reStructuredText file to a dictionary.

@@ -56,7 +55,8 @@ class RstParser(BaseParser):
         for i, line in enumerate(lines):
             header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
-            if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
+            if header_match and i > 0 and (
+                    len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
                 if current_header is not None:
                     if current_text == "" or None:
                         continue

@@ -72,7 +72,7 @@ class RstParser(BaseParser):
                 rst_tups.append((current_header, current_text))

-        #TODO: Format for rst
+        # TODO: Format for rst
         #
         # if current_header is not None:
         # # pass linting, assert keys are defined

@@ -136,7 +136,7 @@ class RstParser(BaseParser):
         return {}

     def parse_tups(
             self, filepath: Path, errors: str = "ignore"
     ) -> List[Tuple[Optional[str], str]]:
         """Parse file into tuples."""
         with open(filepath, "r") as f:

@@ -159,7 +159,7 @@ class RstParser(BaseParser):
         return rst_tups

     def parse_file(
             self, filepath: Path, errors: str = "ignore"
     ) -> Union[str, List[str]]:
         """Parse file into string."""
         tups = self.parse_tups(filepath, errors=errors)

@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
     """

     def __init__(
             self,
             *args: Any,
             concat_rows: bool = True,
             col_joiner: str = ", ",
             row_joiner: str = "\n",
             pandas_config: dict = {},
             **kwargs: Any
     ) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)

@@ -1,6 +1,8 @@
 import os
 import javalang

 def find_files(directory):
     files_list = []
     for root, dirs, files in os.walk(directory):

@@ -9,6 +11,7 @@ def find_files(directory):
                 files_list.append(os.path.join(root, file))
     return files_list

 def extract_functions(file_path):
     with open(file_path, "r") as file:
         java_code = file.read()

@@ -28,6 +31,7 @@ def extract_functions(file_path):
             methods[method_name] = method_source_code
     return methods

 def extract_classes(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -47,6 +51,7 @@ def extract_classes(file_path):
         classes[class_name] = class_string
     return classes

 def extract_functions_and_classes(directory):
     files = find_files(directory)
     functions_dict = {}

@@ -58,4 +63,4 @@ def extract_functions_and_classes(directory):
         classes = extract_classes(file)
         if classes:
             classes_dict[file] = classes
     return functions_dict, classes_dict

@@ -1,6 +1,7 @@
 import os
-import esprima
 import escodegen
+import esprima

 def find_files(directory):

@@ -11,6 +12,7 @@ def find_files(directory):
                 files_list.append(os.path.join(root, file))
     return files_list

 def extract_functions(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -26,7 +28,6 @@ def extract_functions(file_path):
             func_name = declaration.id.name if declaration.id else '<anonymous>'
             functions[func_name] = escodegen.generate(declaration.init)
         elif node.type == 'ClassDeclaration':
-            class_name = node.id.name
             for subnode in node.body.body:
                 if subnode.type == 'MethodDefinition':
                     func_name = subnode.key.name

@@ -38,6 +39,7 @@ def extract_functions(file_path):
             functions[func_name] = escodegen.generate(declaration.init)
     return functions

 def extract_classes(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -53,6 +55,7 @@ def extract_classes(file_path):
         classes[class_name] = ", ".join(function_names)
     return classes

 def extract_functions_and_classes(directory):
     files = find_files(directory)
     functions_dict = {}

@@ -1,32 +1,32 @@
 import os
-import faiss
-import pickle
 import tiktoken
-from langchain.vectorstores import FAISS
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
-#from langchain.embeddings import HuggingFaceEmbeddings
-#from langchain.embeddings import HuggingFaceInstructEmbeddings
-#from langchain.embeddings import CohereEmbeddings
 from retry import retry
+# from langchain.embeddings import HuggingFaceEmbeddings
+# from langchain.embeddings import HuggingFaceInstructEmbeddings
+# from langchain.embeddings import CohereEmbeddings

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens/1000) * 0.0004)
+    total_price = ((num_tokens / 1000) * 0.0004)
     return num_tokens, total_price

 @retry(tries=10, delay=60)
 def store_add_texts_with_retry(store, i):
     store.add_texts([i.page_content], metadatas=[i.metadata])
-    #store_pine.add_texts([i.page_content], metadatas=[i.metadata])
+    # store_pine.add_texts([i.page_content], metadatas=[i.metadata])

 def call_openai_api(docs, folder_name, task_status):
     # Function to create a vector store from the documents and save it to disk.
     # create output folder if it doesn't exist
     if not os.path.exists(f"{folder_name}"):

@@ -44,7 +44,8 @@ def call_openai_api(docs, folder_name, task_status):
     # hf = HuggingFaceEmbeddings(model_name=model_name)
     # store = FAISS.from_documents(docs_test, hf)
     s1 = len(docs)
-    for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
+    for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
+                  bar_format='{l_bar}{bar}| Time Left: {remaining}'):
         try:
             task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)})
             store_add_texts_with_retry(store, i)

@@ -58,20 +59,20 @@ def call_openai_api(docs, folder_name, task_status):
         c1 += 1
     store.save_local(f"{folder_name}")

 def get_user_permission(docs, folder_name):
     # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
     # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    #docs_content = (" ".join(docs))
+    # docs_content = (" ".join(docs))
     docs_content = ""
     for doc in docs:
         docs_content += doc.page_content
     tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     print(f"Number of Tokens = {format(tokens, ',d')}")
     print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    #Here we check for user permission before calling the API.
+    # Here we check for user permission before calling the API.
     user_input = input("Price Okay? (Y/N) \n").lower()
     if user_input == "y":
         call_openai_api(docs, folder_name)
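For context, the hard-coded `0.0004` in `num_tokens_from_string` prices tokens at $0.0004 per 1,000, which appears to correspond to the OpenAI Ada embedding rate at the time; a quick standalone sanity check of that arithmetic:

# Worked example of the cost estimate used above (assumes the same
# $0.0004-per-1K-tokens rate that is hard-coded in num_tokens_from_string).
num_tokens = 250_000
total_price = (num_tokens / 1000) * 0.0004
print(f"{num_tokens:,} tokens -> ${total_price:.2f}")  # 250,000 tokens -> $0.10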

@@ -1,10 +1,12 @@
-import os
 import ast
-import tiktoken
+import os
 from pathlib import Path
+import tiktoken
 from langchain.llms import OpenAI
 from langchain.prompts import PromptTemplate

 def find_files(directory):
     files_list = []
     for root, dirs, files in os.walk(directory):

@@ -13,6 +15,7 @@ def find_files(directory):
                 files_list.append(os.path.join(root, file))
     return files_list

 def extract_functions(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -25,6 +28,7 @@ def extract_functions(file_path):
             functions[func_name] = func_def
     return functions

 def extract_classes(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -40,6 +44,7 @@ def extract_classes(file_path):
         classes[class_name] = ", ".join(function_names)
     return classes

 def extract_functions_and_classes(directory):
     files = find_files(directory)
     functions_dict = {}

@@ -53,11 +58,12 @@ def extract_functions_and_classes(directory):
             classes_dict[file] = classes
     return functions_dict, classes_dict

 def parse_functions(functions_dict, formats, dir):
     c1 = len(functions_dict)
     for i, (source, functions) in enumerate(functions_dict.items(), start=1):
         print(f"Processing file {i}/{c1}")
-        source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
+        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
         subfolders = "/".join(source_w.split("/")[:-1])
         Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
         for j, (name, function) in enumerate(functions.items(), start=1):

@@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir):
             response = llm(prompt.format(code=function))
             mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
             with open(f"outputs/{source_w}", mode) as f:
-                f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
+                f.write(
+                    f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")

 def parse_classes(classes_dict, formats, dir):
     c1 = len(classes_dict)
     for i, (source, classes) in enumerate(classes_dict.items()):
-        print(f"Processing file {i+1}/{c1}")
-        source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
+        print(f"Processing file {i + 1}/{c1}")
+        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
         subfolders = "/".join(source_w.split("/")[:-1])
         Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
         for name, function_names in classes.items():
-            print(f"Processing Class {i+1}/{c1}")
+            print(f"Processing Class {i + 1}/{c1}")
             prompt = PromptTemplate(
                 input_variables=["class_name", "functions_names"],
                 template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",

@@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir):
             with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
                 f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")

 def transform_to_docs(functions_dict, classes_dict, formats, dir):
     docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
     docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])

@@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir):
         parse_classes(classes_dict, formats, dir)
         print("All done!")
     else:
         print("The API was not called. No money was spent.")

@@ -2,7 +2,6 @@
 from dataclasses import dataclass
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.schema import BaseDocument

@@ -1,9 +1,9 @@
 import re
-import tiktoken
+from math import ceil
 from typing import List
+import tiktoken
 from parser.schema.base import Document
-from math import ceil

 def separate_header_and_body(text):

@@ -13,6 +13,7 @@ def separate_header_and_body(text):
     body = text[len(header):]
     return header, body

 def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
     docs = []
     current_group = None

@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
         if current_group is None:
             current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                      extra_info=doc.extra_info)
-        elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
+        elif len(tiktoken.get_encoding("cl100k_base").encode(
+                current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
             current_group.text += " " + doc.text
         else:
             docs.append(current_group)

@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
     return docs

 def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
     docs = []
     for doc in documents:

@@ -54,17 +57,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
         docs.append(new_doc)
     return docs

 def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
-    if token_check == False:
+    if not token_check:
         return documents
     print("Grouping small documents")
     try:
         documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     print("Separating large documents")
     try:
         documents = split_documents(documents=documents, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     return documents
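The `token_check == False` to `not token_check` rewrite above is Ruff's fix for explicit comparisons against `True`/`False` (E712). A tiny illustrative sketch, separate from the codebase:

# Truthiness testing is the idiomatic equivalent of comparing a plain boolean
# flag to False with `==`.
token_check = False
if not token_check:  # preferred over `if token_check == False:`
    print("skipping token check")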

@@ -1,18 +1,17 @@
-import requests
-import nltk
 import os
+import shutil
+import string
+import zipfile
+from urllib.parse import urljoin
+import nltk
+import requests
+from core.settings import settings
 from parser.file.bulk import SimpleDirectoryReader
-from parser.schema.base import Document
 from parser.open_ai_func import call_openai_api
+from parser.schema.base import Document
 from parser.token_func import group_split
-from urllib.parse import urljoin
-from core.settings import settings
-import string
-import zipfile
-import shutil

 try:
     nltk.download('punkt', quiet=True)

@@ -50,7 +49,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     with open(full_path + '/' + filename, 'wb') as f:
         f.write(file)
-    #check if file is .zip and extract it
+    # check if file is .zip and extract it
     if filename.endswith('.zip'):
         with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
             zip_ref.extractall(full_path)

@@ -68,7 +67,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     call_openai_api(docs, full_path, self)
     self.update_state(state='PROGRESS', meta={'current': 100})
-    if sample == True:
+    if sample:
         for i in range(min(5, len(raw_docs))):
             print(raw_docs[i].text)

@@ -1,4 +1,4 @@
 from app import app

 if __name__ == "__main__":
     app.run(debug=True, port=5001)

@@ -1,18 +1,20 @@
-import requests
-import dotenv
 import os
-import json
 import pprint
+import dotenv
+import requests
+from flask import Flask, request

 dotenv.load_dotenv()
 docsgpt_url = os.getenv("docsgpt_url")
 chatwoot_url = os.getenv("chatwoot_url")
 docsgpt_key = os.getenv("docsgpt_key")
 chatwoot_token = os.getenv("chatwoot_token")
-#account_id = os.getenv("account_id")
-#assignee_id = os.getenv("assignee_id")
+# account_id = os.getenv("account_id")
+# assignee_id = os.getenv("assignee_id")
 label_stop = "human-requested"

 def send_to_bot(sender, message):
     data = {
         'sender': sender,

@@ -43,7 +45,6 @@ def send_to_chatwoot(account, conversation, message):
     return r.json()

-from flask import Flask, request
 app = Flask(__name__)

@@ -74,7 +75,7 @@ def docsgpt():
     # elif str(assignee) != str(assignee_id):
     # return "Not the right assignee"
-    if(message_type == "incoming"):
+    if (message_type == "incoming"):
         bot_response = send_to_bot(contact, message)
         create_message = send_to_chatwoot(
             account, conversation, bot_response)

@@ -83,5 +84,6 @@ def docsgpt():
     return create_message

 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=80)

@@ -1,17 +1,10 @@
+import ast
+import json
 from pathlib import Path
-from langchain.text_splitter import CharacterTextSplitter
-import faiss
+import dotenv
-from langchain.vectorstores import FAISS
-from langchain.embeddings import OpenAIEmbeddings
 from langchain.llms import OpenAI
 from langchain.prompts import PromptTemplate
-import pickle
-import dotenv
-import tiktoken
-import sys
-from argparse import ArgumentParser
-import ast
-import json

 dotenv.load_dotenv()

@@ -24,12 +17,6 @@ for p in ps:
     sources.append(p)
-# with open('inputs/client.py', 'r') as f:
-# tree = ast.parse(f.read())
-# print(tree)

 def get_functions_in_class(node):
     functions = []
     functions_code = []

@@ -66,16 +53,6 @@ for code in data:
 with open('structure_dict.json', 'w') as f:
     json.dump(structure_dict, f)

-# llm = OpenAI(temperature=0)
-# prompt = PromptTemplate(
-# input_variables=["code"],
-# template="Code: {code}, Documentation: ",
-# )
-#
-# print(prompt.format(code="print('hello world')"))
-# print(llm(prompt.format(code="print('hello world')")))

 if not Path("outputs").exists():
     Path("outputs").mkdir()

@@ -1,19 +1,19 @@
 import os
 import sys
-import nltk
-import dotenv
-import typer
 from collections import defaultdict
 from typing import List, Optional
+import dotenv
+import nltk
+import typer
 from parser.file.bulk import SimpleDirectoryReader
-from parser.schema.base import Document
+from parser.java2doc import extract_functions_and_classes as extract_java
+from parser.js2doc import extract_functions_and_classes as extract_js
 from parser.open_ai_func import call_openai_api, get_user_permission
-from parser.py2doc import transform_to_docs
 from parser.py2doc import extract_functions_and_classes as extract_py
-from parser.js2doc import extract_functions_and_classes as extract_js
-from parser.java2doc import extract_functions_and_classes as extract_java
+from parser.py2doc import transform_to_docs
+from parser.schema.base import Document
 from parser.token_func import group_split

 dotenv.load_dotenv()

@@ -38,7 +38,8 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
            limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
            formats: Optional[List[str]] = typer.Option([".rst", ".md"],
                                                        help="""List of required extensions (list with .)
-                                                       Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
+                                                       Currently supported:
+                                                       .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
            exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
            sample: Optional[bool] = typer.Option(False,
                                                  help="Whether to output sample of the first 5 split documents."),

@@ -65,7 +66,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
     # docs = text_splitter.split_documents(raw_docs)
     # Sample feature
-    if sample == True:
+    if sample:
         for i in range(min(5, len(raw_docs))):
             print(raw_docs[i].text)

@@ -1,38 +1,42 @@
-from pathlib import Path
-from langchain.text_splitter import CharacterTextSplitter
-import faiss
-from langchain.vectorstores import FAISS
-from langchain.embeddings import OpenAIEmbeddings
 import pickle
-import dotenv
-import tiktoken
 import sys
 from argparse import ArgumentParser
+from pathlib import Path
+import dotenv
+import faiss
+import tiktoken
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens/1000) * 0.0004)
+    total_price = ((num_tokens / 1000) * 0.0004)
     return num_tokens, total_price

 def call_openai_api():
     # Function to create a vector store from the documents and save it to disk.
     store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
     faiss.write_index(store.index, "docs.index")
     store.index = None
     with open("faiss_store.pkl", "wb") as f:
         pickle.dump(store, f)

 def get_user_permission():
     # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
     # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
     docs_content = (" ".join(docs))
     tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     print(f"Number of Tokens = {format(tokens, ',d')}")
     print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    #Here we check for user permission before calling the API.
+    # Here we check for user permission before calling the API.
     user_input = input("Price Okay? (Y/N) \n").lower()
     if user_input == "y":
         call_openai_api()

@@ -41,7 +45,8 @@ def get_user_permission():
     else:
         print("The API was not called. No money was spent.")

-#Load .env file
+# Load .env file
 dotenv.load_dotenv()

 ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")

@@ -1,71 +1,75 @@
 import os
 import pickle
-import dotenv
-import tiktoken
-import sys
-import faiss
 import shutil
+import sys
+from argparse import ArgumentParser
 from pathlib import Path
-from langchain.vectorstores import FAISS
+import dotenv
+import faiss
+import tiktoken
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS
 from sphinx.cmd.build import main as sphinx_main
-from argparse import ArgumentParser

 def convert_rst_to_txt(src_dir, dst_dir):
     # Check if the source directory exists
     if not os.path.exists(src_dir):
         raise Exception("Source directory does not exist")
     # Walk through the source directory
     for root, dirs, files in os.walk(src_dir):
         for file in files:
             # Check if the file has .rst extension
             if file.endswith(".rst"):
                 # Construct the full path of the file
                 src_file = os.path.join(root, file.replace(".rst", ""))
                 # Convert the .rst file to .txt file using sphinx-build
                 args = f". -b text -D extensions=sphinx.ext.autodoc " \
                        f"-D master_doc={src_file} " \
                        f"-D source_suffix=.rst " \
                        f"-C {dst_dir} "
                 sphinx_main(args.split())
             elif file.endswith(".md"):
                 # Rename the .md file to .rst file
                 src_file = os.path.join(root, file)
                 dst_file = os.path.join(root, file.replace(".md", ".rst"))
                 os.rename(src_file, dst_file)
                 # Convert the .rst file to .txt file using sphinx-build
                 args = f". -b text -D extensions=sphinx.ext.autodoc " \
                        f"-D master_doc={dst_file} " \
                        f"-D source_suffix=.rst " \
                        f"-C {dst_dir} "
                 sphinx_main(args.split())

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens/1000) * 0.0004)
+    total_price = ((num_tokens / 1000) * 0.0004)
     return num_tokens, total_price

 def call_openai_api():
     # Function to create a vector store from the documents and save it to disk.
     store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
     faiss.write_index(store.index, "docs.index")
     store.index = None
     with open("faiss_store.pkl", "wb") as f:
         pickle.dump(store, f)

 def get_user_permission():
     # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
     # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
     docs_content = (" ".join(docs))
     tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     print(f"Number of Tokens = {format(tokens, ',d')}")
     print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    #Here we check for user permission before calling the API.
+    # Here we check for user permission before calling the API.
     user_input = input("Price Okay? (Y/N) \n").lower()
     if user_input == "y":
         call_openai_api()

@@ -74,6 +78,7 @@ def get_user_permission():
     else:
         print("The API was not called. No money was spent.")

 ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
 ap.add_argument("-i", "--inputs",
                 type=str,

@@ -81,17 +86,17 @@ ap.add_argument("-i", "--inputs",
                 help="Directory containing documentation files")
 args = ap.parse_args()

-#Load .env file
+# Load .env file
 dotenv.load_dotenv()

-#Directory to vector
+# Directory to vector
 src_dir = args.inputs
 dst_dir = "tmp"
 convert_rst_to_txt(src_dir, dst_dir)

 # Here we load in the data in the format that Notion exports it in.
-ps = list(Path("tmp/"+ src_dir).glob("**/*.txt"))
+ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))

 # parse all child directories
 data = []

@@ -3,7 +3,6 @@ from abc import abstractmethod
 from typing import Any, List
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.base import Document

@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
             Union[str, List[str]]: a string or a List of strings.
         """
         try:
-            import unstructured
+            from unstructured.partition.html import partition_html
+            from unstructured.staging.base import convert_to_isd
+            from unstructured.cleaners.core import clean
         except ImportError:
             raise ValueError("unstructured package is required to parse HTML files.")
-        from unstructured.partition.html import partition_html
-        from unstructured.staging.base import convert_to_isd
-        from unstructured.cleaners.core import clean

         # Using the unstructured library to convert the html to isd format
         # isd sample : isd = [

@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
                 Chunks.append([])
             Chunks[-1].append(isd_el['text'])
-        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
+        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25
+        # TODO: This value can be a user defined variable
         for chunk in Chunks:
             # sum of lenth of all the strings in the chunk
             sum = 0

@@ -7,8 +7,8 @@ import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from parser.file.base_parser import BaseParser
 import tiktoken
+from parser.file.base_parser import BaseParser

 class MarkdownParser(BaseParser):

@@ -5,10 +5,9 @@ Contains parser for md files.
 """
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union
 from parser.file.base_parser import BaseParser
-import tiktoken

 class RstParser(BaseParser):

@@ -1,4 +1,5 @@
 import os
 import javalang

@@ -1,6 +1,7 @@
 import os
-import esprima
 import escodegen
+import esprima

 def find_files(directory):

@@ -27,7 +28,6 @@ def extract_functions(file_path):
             func_name = declaration.id.name if declaration.id else '<anonymous>'
             functions[func_name] = escodegen.generate(declaration.init)
         elif node.type == 'ClassDeclaration':
-            class_name = node.id.name
             for subnode in node.body.body:
                 if subnode.type == 'MethodDefinition':
                     func_name = subnode.key.name

@@ -1,16 +1,15 @@
 import os
-import faiss
-import pickle
 import tiktoken
-from langchain.vectorstores import FAISS
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from retry import retry
 # from langchain.embeddings import HuggingFaceEmbeddings
 # from langchain.embeddings import HuggingFaceInstructEmbeddings
 # from langchain.embeddings import CohereEmbeddings
-from retry import retry

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.

@@ -1,7 +1,8 @@
-import os
 import ast
-import tiktoken
+import os
 from pathlib import Path
+import tiktoken
 from langchain.llms import OpenAI
 from langchain.prompts import PromptTemplate

@@ -2,7 +2,6 @@
 from dataclasses import dataclass
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.schema import BaseDocument

@@ -1,9 +1,9 @@
 import re
-import tiktoken
+from math import ceil
 from typing import List
+import tiktoken
 from parser.schema.base import Document
-from math import ceil

 def separate_header_and_body(text):

@@ -59,16 +59,16 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
 def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
-    if token_check == False:
+    if not token_check:
         return documents
     print("Grouping small documents")
     try:
         documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     print("Separating large documents")
     try:
         documents = split_documents(documents=documents, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     return documents
