Mirror of https://github.com/arc53/DocsGPT, synced 2024-11-17 21:26:26 +00:00
Commit 059ffe09ea

.github/workflows/ci.yml (vendored): 2 changed lines
@@ -23,7 +23,7 @@ jobs:
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Login to ghcr.io
        uses: docker/login-action@v2
        with:
.github/workflows/lint.yml (vendored, new file): 17 lines

@@ -0,0 +1,17 @@
name: Python linting

on:
  push:
    branches:
      - '*'
  pull_request:
    types: [ opened, synchronize ]

jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Lint with Ruff
        uses: chartboost/ruff-action@v1
.ruff.toml (new file): 2 lines

@@ -0,0 +1,2 @@
# Allow lines to be as long as 120 characters.
line-length = 120
@@ -1,8 +1,9 @@
import asyncio
import datetime
import http.client
import json
import os
import traceback
import asyncio

import dotenv
import requests

@@ -26,10 +27,9 @@ from langchain.prompts.chat import (
from pymongo import MongoClient
from werkzeug.utils import secure_filename

from core.settings import settings
from error import bad_request
from worker import ingest_worker
from core.settings import settings
import celeryconfig

# os.environ["LANGCHAIN_HANDLER"] = "langchain"

@@ -177,18 +177,12 @@ def api_answer():
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
template_format="jinja2")
if settings.LLM_NAME == "openai_chat":
# llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
llm = ChatOpenAI(openai_api_key=api_key)
llm = ChatOpenAI(openai_api_key=api_key)  # optional parameter: model_name="gpt-4"
messages_combine = [
SystemMessagePromptTemplate.from_template(chat_combine_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
messages_reduce = [
SystemMessagePromptTemplate.from_template(chat_reduce_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
elif settings.LLM_NAME == "openai":
llm = OpenAI(openai_api_key=api_key, temperature=0)
elif settings.LLM_NAME == "manifest":

@@ -226,7 +220,7 @@ def api_answer():
result['answer'] = result['answer'].replace("\\n", "\n")
try:
result['answer'] = result['answer'].split("SOURCES:")[0]
except:
except Exception:
pass

# mock result

@@ -295,7 +289,7 @@ def api_feedback():
"feedback": feedback
})
)
return {"status": 'ok'}
return {"status": http.client.responses.get(response.status_code, 'ok')}

@app.route('/api/combine', methods=['GET'])
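Note on the api_answer() change above: the bare "except:" around the SOURCES split was narrowed to "except Exception:". A bare except also swallows KeyboardInterrupt and SystemExit, which makes the process hard to stop cleanly; "except Exception:" leaves those alone. A minimal illustration (not code from this commit):

    def strip_sources(answer: str) -> str:
        try:
            return answer.split("SOURCES:")[0]
        except Exception:  # narrower than a bare except; Ctrl-C still propagates
            return answer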
@@ -1,7 +1,8 @@
import os

broker_url = os.getenv("CELERY_BROKER_URL")
result_backend = os.getenv("CELERY_RESULT_BACKEND")

task_serializer = 'json'
result_serializer = 'json'
accept_content = ['json']
accept_content = ['json']
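For context, a module-level Celery settings file like this one is normally loaded with config_from_object; a minimal sketch (the app name and the way the app object is created are assumptions, not taken from this commit):

    from celery import Celery

    celery = Celery(__name__)
    # Picks up broker_url, result_backend and the serializer settings from celeryconfig.py
    celery.config_from_object('celeryconfig')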
@@ -1,6 +1,7 @@
from pydantic import BaseSettings
from pathlib import Path

from pydantic import BaseSettings


class Settings(BaseSettings):
LLM_NAME: str = "openai_chat"
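As a reminder of how a pydantic BaseSettings subclass like Settings resolves values, here is a minimal sketch (assuming pydantic v1, where BaseSettings lives in the top-level package; the environment value is invented for illustration):

    import os
    from pydantic import BaseSettings

    class Settings(BaseSettings):
        LLM_NAME: str = "openai_chat"  # default used when no environment variable is set

    os.environ["LLM_NAME"] = "openai"  # an environment variable overrides the default
    print(Settings().LLM_NAME)         # prints "openai"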
@@ -1,13 +1,15 @@
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES

def response_error(code_status,message=None):
payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")}

def response_error(code_status, message=None):
payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
if message:
payload['message'] = message
response = jsonify(payload)
response.status_code = code_status
return response

def bad_request(status_code=400,message=''):
return response_error(code_status=status_code,message=message)

def bad_request(status_code=400, message=''):
return response_error(code_status=status_code, message=message)
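A minimal usage sketch for the bad_request helper above inside a Flask view (the route and field name are illustrative assumptions, not code from this commit):

    from flask import Flask, request
    from error import bad_request

    app = Flask(__name__)

    @app.route('/api/answer', methods=['POST'])
    def answer():
        data = request.get_json()
        if not data or 'question' not in data:
            # responds with {"error": "Bad Request", "message": "question is required"} and status 400
            return bad_request(400, 'question is required')
        return {"status": 'ok'}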
@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List

from langchain.docstore.document import Document as LCDocument

from parser.schema.base import Document

@@ -9,6 +9,7 @@ from typing import Dict, Union

from parser.file.base_parser import BaseParser


class HTMLParser(BaseParser):
"""HTML parser."""

@@ -23,38 +24,37 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean

# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)

# Removing non ascii charactwers from isd_el['text']
# Removing non ascii charactwers from isd_el['text']
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()

# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)

# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)

# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']

# Creating 'Chunks' - List of lists of strings
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'

@@ -64,19 +64,20 @@ class HTMLParser(BaseParser):
Chunks = [[]]
final_chunks = list(list())

for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])

# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
# TODO: This value can be an user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks
@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser


class MarkdownParser(BaseParser):

@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
self._max_tokens = max_tokens
# self._remove_tables = remove_tables


def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:

@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
else:
tups.append((current_header, current_text))
return tups

def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.

@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:

@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
return markdown_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
@@ -5,10 +5,10 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union

from parser.file.base_parser import BaseParser
import tiktoken


class RstParser(BaseParser):
"""reStructuredText parser.

@@ -19,17 +19,17 @@ class RstParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be carefull with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be carefull with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

@@ -41,7 +41,6 @@ class RstParser(BaseParser):
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess


def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.

@@ -56,7 +55,8 @@ class RstParser(BaseParser):

for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue

@@ -72,7 +72,7 @@ class RstParser(BaseParser):

rst_tups.append((current_header, current_text))

#TODO: Format for rst
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined

@@ -136,7 +136,7 @@ class RstParser(BaseParser):
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:

@@ -159,7 +159,7 @@ class RstParser(BaseParser):
return rst_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
"""

def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -1,6 +1,8 @@
import os

import javalang


def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):

@@ -9,6 +11,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list


def extract_functions(file_path):
with open(file_path, "r") as file:
java_code = file.read()

@@ -28,6 +31,7 @@ def extract_functions(file_path):
methods[method_name] = method_source_code
return methods


def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -47,6 +51,7 @@ def extract_classes(file_path):
classes[class_name] = class_string
return classes


def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

@@ -58,4 +63,4 @@ def extract_functions_and_classes(directory):
classes = extract_classes(file)
if classes:
classes_dict[file] = classes
return functions_dict, classes_dict
return functions_dict, classes_dict
@@ -1,6 +1,7 @@
import os
import esprima

import escodegen
import esprima


def find_files(directory):

@@ -11,6 +12,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list


def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -26,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name

@@ -38,6 +39,7 @@ def extract_functions(file_path):
functions[func_name] = escodegen.generate(declaration.init)
return functions


def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -53,6 +55,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes


def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
@@ -1,32 +1,32 @@
import os
import faiss
import pickle

import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

#from langchain.embeddings import HuggingFaceEmbeddings
#from langchain.embeddings import HuggingFaceInstructEmbeddings
#from langchain.embeddings import CohereEmbeddings

from langchain.vectorstores import FAISS
from retry import retry


# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price


@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
store.add_texts([i.page_content], metadatas=[i.metadata])
#store_pine.add_texts([i.page_content], metadatas=[i.metadata])
# store_pine.add_texts([i.page_content], metadatas=[i.metadata])


def call_openai_api(docs, folder_name, task_status):
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.

# create output folder if it doesn't exist
if not os.path.exists(f"{folder_name}"):

@@ -44,7 +44,8 @@ def call_openai_api(docs, folder_name, task_status):
# hf = HuggingFaceEmbeddings(model_name=model_name)
# store = FAISS.from_documents(docs_test, hf)
s1 = len(docs)
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
bar_format='{l_bar}{bar}| Time Left: {remaining}'):
try:
task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)})
store_add_texts_with_retry(store, i)

@@ -58,20 +59,20 @@ def call_openai_api(docs, folder_name, task_status):
c1 += 1
store.save_local(f"{folder_name}")


def get_user_permission(docs, folder_name):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
#docs_content = (" ".join(docs))
# docs_content = (" ".join(docs))
docs_content = ""
for doc in docs:
docs_content += doc.page_content

tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api(docs, folder_name)
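A quick usage sketch for num_tokens_from_string as defined above (the sample text is invented; the $0.0004 per 1,000 tokens figure is simply the constant hard-coded in the function, not a statement about current pricing):

    import tiktoken

    def num_tokens_from_string(string, encoding_name):
        # Convert a string to tokens and estimate the embedding cost.
        encoding = tiktoken.get_encoding(encoding_name)
        num_tokens = len(encoding.encode(string))
        total_price = (num_tokens / 1000) * 0.0004
        return num_tokens, total_price

    tokens, price = num_tokens_from_string("DocsGPT ingests project documentation.", "cl100k_base")
    print(f"{tokens} tokens, approx ${price:.6f}")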
@@ -1,10 +1,12 @@
import os
import ast
import tiktoken
import os
from pathlib import Path

import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate


def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):

@@ -13,6 +15,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list


def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -25,6 +28,7 @@ def extract_functions(file_path):
functions[func_name] = func_def
return functions


def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -40,6 +44,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes


def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

@@ -53,11 +58,12 @@ def extract_functions_and_classes(directory):
classes_dict[file] = classes
return functions_dict, classes_dict


def parse_functions(functions_dict, formats, dir):
c1 = len(functions_dict)
for i, (source, functions) in enumerate(functions_dict.items(), start=1):
print(f"Processing file {i}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for j, (name, function) in enumerate(functions.items(), start=1):

@@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir):
response = llm(prompt.format(code=function))
mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
with open(f"outputs/{source_w}", mode) as f:
f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
f.write(
f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")


def parse_classes(classes_dict, formats, dir):
c1 = len(classes_dict)
for i, (source, classes) in enumerate(classes_dict.items()):
print(f"Processing file {i+1}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
print(f"Processing file {i + 1}/{c1}")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for name, function_names in classes.items():
print(f"Processing Class {i+1}/{c1}")
print(f"Processing Class {i + 1}/{c1}")
prompt = PromptTemplate(
input_variables=["class_name", "functions_names"],
template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",

@@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir):
with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")


def transform_to_docs(functions_dict, classes_dict, formats, dir):
docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])

@@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir):
parse_classes(classes_dict, formats, dir)
print("All done!")
else:
print("The API was not called. No money was spent.")
print("The API was not called. No money was spent.")
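The PromptTemplate pattern used in parse_functions and parse_classes can be exercised without spending API credits; this sketch mirrors the commented-out example that also appears elsewhere in this commit:

    from langchain.prompts import PromptTemplate

    prompt = PromptTemplate(
        input_variables=["code"],
        template="Code: {code}, Documentation: ",
    )
    # prompt.format only does string interpolation; no model call happens here
    print(prompt.format(code="print('hello world')"))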
@@ -2,7 +2,6 @@
from dataclasses import dataclass

from langchain.docstore.document import Document as LCDocument

from parser.schema.schema import BaseDocument
@@ -1,9 +1,9 @@
import re
import tiktoken

from typing import List
from parser.schema.base import Document
from math import ceil
from typing import List

import tiktoken
from parser.schema.base import Document


def separate_header_and_body(text):

@@ -13,6 +13,7 @@ def separate_header_and_body(text):
body = text[len(header):]
return header, body


def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
docs = []
current_group = None

@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
if current_group is None:
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)

@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)

return docs


def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
docs = []
for doc in documents:

@@ -54,17 +57,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
docs.append(new_doc)
return docs


def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
return documents
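A small usage sketch for group_split above (a sketch only: the Document keyword arguments mirror those used in group_documents, and the extra_info placeholder value is an assumption):

    from parser.schema.base import Document
    from parser.token_func import group_split

    docs = [
        Document(text="Short intro.", doc_id="1", embedding=None, extra_info={}),
        Document(text="A much longer section of documentation ...", doc_id="2", embedding=None, extra_info={}),
    ]
    # Small documents are merged and oversized ones split before embedding
    docs = group_split(documents=docs, max_tokens=2000, min_tokens=150, token_check=True)
    print([len(d.text) for d in docs])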
@@ -1,18 +1,17 @@
import requests
import nltk
import os

from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api
from parser.token_func import group_split
from urllib.parse import urljoin
from core.settings import settings


import shutil
import string
import zipfile
import shutil
from urllib.parse import urljoin

import nltk
import requests

from core.settings import settings
from parser.file.bulk import SimpleDirectoryReader
from parser.open_ai_func import call_openai_api
from parser.schema.base import Document
from parser.token_func import group_split

try:
nltk.download('punkt', quiet=True)

@@ -50,7 +49,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
with open(full_path + '/' + filename, 'wb') as f:
f.write(file)

#check if file is .zip and extract it
# check if file is .zip and extract it
if filename.endswith('.zip'):
with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
zip_ref.extractall(full_path)

@@ -68,7 +67,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
call_openai_api(docs, full_path, self)
self.update_state(state='PROGRESS', meta={'current': 100})

if sample == True:
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
@@ -1,4 +1,4 @@
from app import app

if __name__ == "__main__":
app.run(debug=True, port=5001)
app.run(debug=True, port=5001)
@@ -1,18 +1,20 @@
import requests
import dotenv
import os
import json
import pprint

import dotenv
import requests
from flask import Flask, request

dotenv.load_dotenv()
docsgpt_url = os.getenv("docsgpt_url")
chatwoot_url = os.getenv("chatwoot_url")
docsgpt_key = os.getenv("docsgpt_key")
chatwoot_token = os.getenv("chatwoot_token")
#account_id = os.getenv("account_id")
#assignee_id = os.getenv("assignee_id")
# account_id = os.getenv("account_id")
# assignee_id = os.getenv("assignee_id")
label_stop = "human-requested"


def send_to_bot(sender, message):
data = {
'sender': sender,

@@ -43,7 +45,6 @@ def send_to_chatwoot(account, conversation, message):
return r.json()


from flask import Flask, request
app = Flask(__name__)


@@ -74,7 +75,7 @@ def docsgpt():
# elif str(assignee) != str(assignee_id):
# return "Not the right assignee"

if(message_type == "incoming"):
if (message_type == "incoming"):
bot_response = send_to_bot(contact, message)
create_message = send_to_chatwoot(
account, conversation, bot_response)

@@ -83,5 +84,6 @@ def docsgpt():

return create_message


if __name__ == '__main__':
app.run(host='0.0.0.0', port=80)
app.run(host='0.0.0.0', port=80)
@@ -1,17 +1,10 @@
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
import ast
import json
from pathlib import Path

import dotenv
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

dotenv.load_dotenv()

@@ -24,12 +17,6 @@ for p in ps:
sources.append(p)


# with open('inputs/client.py', 'r') as f:
# tree = ast.parse(f.read())

# print(tree)


def get_functions_in_class(node):
functions = []
functions_code = []

@@ -66,16 +53,6 @@ for code in data:
with open('structure_dict.json', 'w') as f:
json.dump(structure_dict, f)

# llm = OpenAI(temperature=0)
# prompt = PromptTemplate(
# input_variables=["code"],
# template="Code: {code}, Documentation: ",
# )
#
# print(prompt.format(code="print('hello world')"))
# print(llm(prompt.format(code="print('hello world')")))


if not Path("outputs").exists():
Path("outputs").mkdir()
@@ -1,19 +1,19 @@
import os
import sys
import nltk
import dotenv
import typer

from collections import defaultdict
from typing import List, Optional

import dotenv
import nltk
import typer

from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import transform_to_docs
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.py2doc import transform_to_docs
from parser.schema.base import Document
from parser.token_func import group_split

dotenv.load_dotenv()

@@ -38,7 +38,8 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
formats: Optional[List[str]] = typer.Option([".rst", ".md"],
help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
Currently supported:
.rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
sample: Optional[bool] = typer.Option(False,
help="Whether to output sample of the first 5 split documents."),

@@ -65,7 +66,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
# docs = text_splitter.split_documents(raw_docs)

# Sample feature
if sample == True:
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
@@ -1,38 +1,42 @@
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
from pathlib import Path

import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price


def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)


def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()

@@ -41,7 +45,8 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")

#Load .env file

# Load .env file
dotenv.load_dotenv()

ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")
@@ -1,71 +1,75 @@
import os
import pickle
import dotenv
import tiktoken
import sys
import faiss
import shutil
import sys
from argparse import ArgumentParser
from pathlib import Path
from langchain.vectorstores import FAISS

import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sphinx.cmd.build import main as sphinx_main
from argparse import ArgumentParser


def convert_rst_to_txt(src_dir, dst_dir):
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price


def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)


def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()

@@ -74,6 +78,7 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")


ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
ap.add_argument("-i", "--inputs",
type=str,

@@ -81,17 +86,17 @@ ap.add_argument("-i", "--inputs",
help="Directory containing documentation files")
args = ap.parse_args()

#Load .env file
# Load .env file
dotenv.load_dotenv()

#Directory to vector
# Directory to vector
src_dir = args.inputs
dst_dir = "tmp"

convert_rst_to_txt(src_dir, dst_dir)

# Here we load in the data in the format that Notion exports it in.
ps = list(Path("tmp/"+ src_dir).glob("**/*.txt"))
ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))

# parse all child directories
data = []
@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List

from langchain.docstore.document import Document as LCDocument

from parser.schema.base import Document

@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean

# Using the unstructured library to convert the html to isd format
# isd sample : isd = [

@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
Chunks.append([])
Chunks[-1].append(isd_el['text'])

# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
# TODO: This value can be a user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0

@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser


class MarkdownParser(BaseParser):

@@ -5,10 +5,9 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union

from parser.file.base_parser import BaseParser
import tiktoken


class RstParser(BaseParser):

@@ -1,4 +1,5 @@
import os

import javalang

@@ -1,6 +1,7 @@
import os
import esprima

import escodegen
import esprima


def find_files(directory):

@@ -27,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name

@@ -1,16 +1,15 @@
import os
import faiss
import pickle

import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from retry import retry


# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings

from retry import retry


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.

@@ -1,7 +1,8 @@
import os
import ast
import tiktoken
import os
from pathlib import Path

import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

@@ -2,7 +2,6 @@
from dataclasses import dataclass

from langchain.docstore.document import Document as LCDocument

from parser.schema.schema import BaseDocument

@@ -1,9 +1,9 @@
import re
import tiktoken

from typing import List
from parser.schema.base import Document
from math import ceil
from typing import List

import tiktoken
from parser.schema.base import Document


def separate_header_and_body(text):

@@ -59,16 +59,16 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document


def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
return documents