diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 92cc718b..50c48d5e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: with: username: ${{ secrets.DOCKER_USERNAME }} password: ${{ secrets.DOCKER_PASSWORD }} - + - name: Login to ghcr.io uses: docker/login-action@v2 with: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 00000000..7ee31ebe --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,17 @@ +name: Python linting + +on: + push: + branches: + - '*' + pull_request: + types: [ opened, synchronize ] + +jobs: + ruff: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - name: Lint with Ruff + uses: chartboost/ruff-action@v1 diff --git a/.ruff.toml b/.ruff.toml new file mode 100644 index 00000000..857f8153 --- /dev/null +++ b/.ruff.toml @@ -0,0 +1,2 @@ +# Allow lines to be as long as 120 characters. +line-length = 120 \ No newline at end of file diff --git a/application/app.py b/application/app.py index 19d5f0c3..d68c5b93 100644 --- a/application/app.py +++ b/application/app.py @@ -1,8 +1,9 @@ +import asyncio import datetime +import http.client import json import os import traceback -import asyncio import dotenv import requests @@ -26,10 +27,9 @@ from langchain.prompts.chat import ( from pymongo import MongoClient from werkzeug.utils import secure_filename +from core.settings import settings from error import bad_request from worker import ingest_worker -from core.settings import settings -import celeryconfig # os.environ["LANGCHAIN_HANDLER"] = "langchain" @@ -177,18 +177,12 @@ def api_answer(): q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest, template_format="jinja2") if settings.LLM_NAME == "openai_chat": - # llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4") - llm = ChatOpenAI(openai_api_key=api_key) + llm = ChatOpenAI(openai_api_key=api_key) # optional parameter: model_name="gpt-4" messages_combine = [ SystemMessagePromptTemplate.from_template(chat_combine_template), HumanMessagePromptTemplate.from_template("{question}") ] p_chat_combine = ChatPromptTemplate.from_messages(messages_combine) - messages_reduce = [ - SystemMessagePromptTemplate.from_template(chat_reduce_template), - HumanMessagePromptTemplate.from_template("{question}") - ] - p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce) elif settings.LLM_NAME == "openai": llm = OpenAI(openai_api_key=api_key, temperature=0) elif settings.LLM_NAME == "manifest": @@ -226,7 +220,7 @@ def api_answer(): result['answer'] = result['answer'].replace("\\n", "\n") try: result['answer'] = result['answer'].split("SOURCES:")[0] - except: + except Exception: pass # mock result @@ -295,7 +289,7 @@ def api_feedback(): "feedback": feedback }) ) - return {"status": 'ok'} + return {"status": http.client.responses.get(response.status_code, 'ok')} @app.route('/api/combine', methods=['GET']) diff --git a/application/celeryconfig.py b/application/celeryconfig.py index efa10f07..712b3bfc 100644 --- a/application/celeryconfig.py +++ b/application/celeryconfig.py @@ -1,7 +1,8 @@ import os + broker_url = os.getenv("CELERY_BROKER_URL") result_backend = os.getenv("CELERY_RESULT_BACKEND") task_serializer = 'json' result_serializer = 'json' -accept_content = ['json'] \ No newline at end of file +accept_content = ['json'] diff --git a/application/core/settings.py b/application/core/settings.py index 8c3ccb82..fa654ed5 100644 --- a/application/core/settings.py +++ 
b/application/core/settings.py @@ -1,6 +1,7 @@ -from pydantic import BaseSettings from pathlib import Path +from pydantic import BaseSettings + class Settings(BaseSettings): LLM_NAME: str = "openai_chat" diff --git a/application/error.py b/application/error.py index cab5ea39..5d42f0ee 100644 --- a/application/error.py +++ b/application/error.py @@ -1,13 +1,15 @@ from flask import jsonify from werkzeug.http import HTTP_STATUS_CODES -def response_error(code_status,message=None): - payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")} + +def response_error(code_status, message=None): + payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")} if message: payload['message'] = message response = jsonify(payload) response.status_code = code_status return response -def bad_request(status_code=400,message=''): - return response_error(code_status=status_code,message=message) \ No newline at end of file + +def bad_request(status_code=400, message=''): + return response_error(code_status=status_code, message=message) diff --git a/application/parser/file/base.py b/application/parser/file/base.py index c2777a06..2fe9a75d 100644 --- a/application/parser/file/base.py +++ b/application/parser/file/base.py @@ -3,7 +3,6 @@ from abc import abstractmethod from typing import Any, List from langchain.docstore.document import Document as LCDocument - from parser.schema.base import Document diff --git a/application/parser/file/html_parser.py b/application/parser/file/html_parser.py index 53d7492f..96460c7c 100644 --- a/application/parser/file/html_parser.py +++ b/application/parser/file/html_parser.py @@ -9,6 +9,7 @@ from typing import Dict, Union from parser.file.base_parser import BaseParser + class HTMLParser(BaseParser): """HTML parser.""" @@ -23,38 +24,37 @@ class HTMLParser(BaseParser): Union[str, List[str]]: a string or a List of strings. 
""" try: - import unstructured + from unstructured.partition.html import partition_html + from unstructured.staging.base import convert_to_isd + from unstructured.cleaners.core import clean except ImportError: raise ValueError("unstructured package is required to parse HTML files.") - from unstructured.partition.html import partition_html - from unstructured.staging.base import convert_to_isd - from unstructured.cleaners.core import clean # Using the unstructured library to convert the html to isd format # isd sample : isd = [ - # {"text": "My Title", "type": "Title"}, - # {"text": "My Narrative", "type": "NarrativeText"} - # ] + # {"text": "My Title", "type": "Title"}, + # {"text": "My Narrative", "type": "NarrativeText"} + # ] with open(file, "r", encoding="utf-8") as fp: elements = partition_html(file=fp) - isd = convert_to_isd(elements) + isd = convert_to_isd(elements) - # Removing non ascii charactwers from isd_el['text'] + # Removing non ascii charactwers from isd_el['text'] for isd_el in isd: isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode() # Removing all the \n characters from isd_el['text'] using regex and replace with single space # Removing all the extra spaces from isd_el['text'] using regex and replace with single space for isd_el in isd: - isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL) - isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL) + isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL) + isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL) # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation for isd_el in isd: - clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True ) + clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True) # Creating a list of all the indexes of isd_el['type'] = 'Title' - title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title'] + title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title'] # Creating 'Chunks' - List of lists of strings # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title' @@ -64,19 +64,20 @@ class HTMLParser(BaseParser): Chunks = [[]] final_chunks = list(list()) - for i,isd_el in enumerate(isd): + for i, isd_el in enumerate(isd): if i in title_indexes: Chunks.append([]) Chunks[-1].append(isd_el['text']) - # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable + # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 + # TODO: This value can be an user defined variable for chunk in Chunks: # sum of lenth of all the strings in the chunk sum = 0 sum += len(str(chunk)) if sum < 25: Chunks.remove(chunk) - else : + else: # appending all the approved chunks to final_chunks as a single string final_chunks.append(" ".join([str(item) for item in chunk])) return final_chunks diff --git a/application/parser/file/markdown_parser.py b/application/parser/file/markdown_parser.py index 2dd9e430..d8aeb3b0 100644 --- a/application/parser/file/markdown_parser.py +++ b/application/parser/file/markdown_parser.py @@ -7,8 +7,8 @@ import re from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, cast -from parser.file.base_parser import BaseParser import tiktoken +from parser.file.base_parser 
import BaseParser class MarkdownParser(BaseParser): @@ -20,13 +20,13 @@ class MarkdownParser(BaseParser): """ def __init__( - self, - *args: Any, - remove_hyperlinks: bool = True, - remove_images: bool = True, - max_tokens: int = 2048, - # remove_tables: bool = True, - **kwargs: Any, + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + max_tokens: int = 2048, + # remove_tables: bool = True, + **kwargs: Any, ) -> None: """Init params.""" super().__init__(*args, **kwargs) @@ -35,8 +35,8 @@ class MarkdownParser(BaseParser): self._max_tokens = max_tokens # self._remove_tables = remove_tables - - def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str): + def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], + current_text: str): """Append to tups chunk.""" num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text)) if num_tokens > self._max_tokens: @@ -46,6 +46,7 @@ class MarkdownParser(BaseParser): else: tups.append((current_header, current_text)) return tups + def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]: """Convert a markdown file to a dictionary. @@ -115,7 +116,7 @@ class MarkdownParser(BaseParser): return {} def parse_tups( - self, filepath: Path, errors: str = "ignore" + self, filepath: Path, errors: str = "ignore" ) -> List[Tuple[Optional[str], str]]: """Parse file into tuples.""" with open(filepath, "r") as f: @@ -130,7 +131,7 @@ class MarkdownParser(BaseParser): return markdown_tups def parse_file( - self, filepath: Path, errors: str = "ignore" + self, filepath: Path, errors: str = "ignore" ) -> Union[str, List[str]]: """Parse file into string.""" tups = self.parse_tups(filepath, errors=errors) diff --git a/application/parser/file/rst_parser.py b/application/parser/file/rst_parser.py index 1719b84c..f8feff70 100644 --- a/application/parser/file/rst_parser.py +++ b/application/parser/file/rst_parser.py @@ -5,10 +5,10 @@ Contains parser for md files. """ import re from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union from parser.file.base_parser import BaseParser -import tiktoken + class RstParser(BaseParser): """reStructuredText parser. @@ -19,17 +19,17 @@ class RstParser(BaseParser): """ def __init__( - self, - *args: Any, - remove_hyperlinks: bool = True, - remove_images: bool = True, - remove_table_excess: bool = True, - remove_interpreters: bool = True, - remove_directives: bool = True, - remove_whitespaces_excess: bool = True, - #Be carefull with remove_characters_excess, might cause data loss - remove_characters_excess: bool = True, - **kwargs: Any, + self, + *args: Any, + remove_hyperlinks: bool = True, + remove_images: bool = True, + remove_table_excess: bool = True, + remove_interpreters: bool = True, + remove_directives: bool = True, + remove_whitespaces_excess: bool = True, + # Be carefull with remove_characters_excess, might cause data loss + remove_characters_excess: bool = True, + **kwargs: Any, ) -> None: """Init params.""" super().__init__(*args, **kwargs) @@ -41,7 +41,6 @@ class RstParser(BaseParser): self._remove_whitespaces_excess = remove_whitespaces_excess self._remove_characters_excess = remove_characters_excess - def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]: """Convert a reStructuredText file to a dictionary. 
@@ -56,7 +55,8 @@ class RstParser(BaseParser): for i, line in enumerate(lines): header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line) - if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]): + if header_match and i > 0 and ( + len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]): if current_header is not None: if current_text == "" or None: continue @@ -72,7 +72,7 @@ class RstParser(BaseParser): rst_tups.append((current_header, current_text)) - #TODO: Format for rst + # TODO: Format for rst # # if current_header is not None: # # pass linting, assert keys are defined @@ -136,7 +136,7 @@ class RstParser(BaseParser): return {} def parse_tups( - self, filepath: Path, errors: str = "ignore" + self, filepath: Path, errors: str = "ignore" ) -> List[Tuple[Optional[str], str]]: """Parse file into tuples.""" with open(filepath, "r") as f: @@ -159,7 +159,7 @@ class RstParser(BaseParser): return rst_tups def parse_file( - self, filepath: Path, errors: str = "ignore" + self, filepath: Path, errors: str = "ignore" ) -> Union[str, List[str]]: """Parse file into string.""" tups = self.parse_tups(filepath, errors=errors) diff --git a/application/parser/file/tabular_parser.py b/application/parser/file/tabular_parser.py index bbb875e1..d7c6402a 100644 --- a/application/parser/file/tabular_parser.py +++ b/application/parser/file/tabular_parser.py @@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser): """ def __init__( - self, - *args: Any, - concat_rows: bool = True, - col_joiner: str = ", ", - row_joiner: str = "\n", - pandas_config: dict = {}, - **kwargs: Any + self, + *args: Any, + concat_rows: bool = True, + col_joiner: str = ", ", + row_joiner: str = "\n", + pandas_config: dict = {}, + **kwargs: Any ) -> None: """Init params.""" super().__init__(*args, **kwargs) diff --git a/application/parser/java2doc.py b/application/parser/java2doc.py index c1701c5d..2a8bfa3a 100644 --- a/application/parser/java2doc.py +++ b/application/parser/java2doc.py @@ -1,6 +1,8 @@ import os + import javalang + def find_files(directory): files_list = [] for root, dirs, files in os.walk(directory): @@ -9,6 +11,7 @@ def find_files(directory): files_list.append(os.path.join(root, file)) return files_list + def extract_functions(file_path): with open(file_path, "r") as file: java_code = file.read() @@ -28,6 +31,7 @@ def extract_functions(file_path): methods[method_name] = method_source_code return methods + def extract_classes(file_path): with open(file_path, 'r') as file: source_code = file.read() @@ -47,6 +51,7 @@ def extract_classes(file_path): classes[class_name] = class_string return classes + def extract_functions_and_classes(directory): files = find_files(directory) functions_dict = {} @@ -58,4 +63,4 @@ def extract_functions_and_classes(directory): classes = extract_classes(file) if classes: classes_dict[file] = classes - return functions_dict, classes_dict \ No newline at end of file + return functions_dict, classes_dict diff --git a/application/parser/js2doc.py b/application/parser/js2doc.py index d434ab23..6dc44812 100644 --- a/application/parser/js2doc.py +++ b/application/parser/js2doc.py @@ -1,6 +1,7 @@ import os -import esprima + import escodegen +import esprima def find_files(directory): @@ -11,6 +12,7 @@ def find_files(directory): files_list.append(os.path.join(root, file)) return files_list + def extract_functions(file_path): with open(file_path, 'r') as file: source_code = file.read() @@ -26,7 
+28,6 @@ def extract_functions(file_path): func_name = declaration.id.name if declaration.id else '' functions[func_name] = escodegen.generate(declaration.init) elif node.type == 'ClassDeclaration': - class_name = node.id.name for subnode in node.body.body: if subnode.type == 'MethodDefinition': func_name = subnode.key.name @@ -38,6 +39,7 @@ def extract_functions(file_path): functions[func_name] = escodegen.generate(declaration.init) return functions + def extract_classes(file_path): with open(file_path, 'r') as file: source_code = file.read() @@ -53,6 +55,7 @@ def extract_classes(file_path): classes[class_name] = ", ".join(function_names) return classes + def extract_functions_and_classes(directory): files = find_files(directory) functions_dict = {} diff --git a/application/parser/open_ai_func.py b/application/parser/open_ai_func.py index c25b2d09..969165d2 100644 --- a/application/parser/open_ai_func.py +++ b/application/parser/open_ai_func.py @@ -1,32 +1,32 @@ import os -import faiss -import pickle + import tiktoken -from langchain.vectorstores import FAISS from langchain.embeddings import OpenAIEmbeddings - -#from langchain.embeddings import HuggingFaceEmbeddings -#from langchain.embeddings import HuggingFaceInstructEmbeddings -#from langchain.embeddings import CohereEmbeddings - +from langchain.vectorstores import FAISS from retry import retry +# from langchain.embeddings import HuggingFaceEmbeddings +# from langchain.embeddings import HuggingFaceInstructEmbeddings +# from langchain.embeddings import CohereEmbeddings + def num_tokens_from_string(string: str, encoding_name: str) -> int: -# Function to convert string to tokens and estimate user cost. + # Function to convert string to tokens and estimate user cost. encoding = tiktoken.get_encoding(encoding_name) num_tokens = len(encoding.encode(string)) - total_price = ((num_tokens/1000) * 0.0004) + total_price = ((num_tokens / 1000) * 0.0004) return num_tokens, total_price + @retry(tries=10, delay=60) def store_add_texts_with_retry(store, i): store.add_texts([i.page_content], metadatas=[i.metadata]) - #store_pine.add_texts([i.page_content], metadatas=[i.metadata]) + # store_pine.add_texts([i.page_content], metadatas=[i.metadata]) + def call_openai_api(docs, folder_name, task_status): -# Function to create a vector store from the documents and save it to disk. + # Function to create a vector store from the documents and save it to disk. # create output folder if it doesn't exist if not os.path.exists(f"{folder_name}"): @@ -44,7 +44,8 @@ def call_openai_api(docs, folder_name, task_status): # hf = HuggingFaceEmbeddings(model_name=model_name) # store = FAISS.from_documents(docs_test, hf) s1 = len(docs) - for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'): + for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), + bar_format='{l_bar}{bar}| Time Left: {remaining}'): try: task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)}) store_add_texts_with_retry(store, i) @@ -58,20 +59,20 @@ def call_openai_api(docs, folder_name, task_status): c1 += 1 store.save_local(f"{folder_name}") + def get_user_permission(docs, folder_name): -# Function to ask user permission to call the OpenAI api and spend their OpenAI funds. + # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. 
- #docs_content = (" ".join(docs)) + # docs_content = (" ".join(docs)) docs_content = "" for doc in docs: docs_content += doc.page_content - tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base") # Here we print the number of tokens and the approx user cost with some visually appealing formatting. print(f"Number of Tokens = {format(tokens, ',d')}") print(f"Approx Cost = ${format(total_price, ',.2f')}") - #Here we check for user permission before calling the API. + # Here we check for user permission before calling the API. user_input = input("Price Okay? (Y/N) \n").lower() if user_input == "y": call_openai_api(docs, folder_name) diff --git a/application/parser/py2doc.py b/application/parser/py2doc.py index 4ac73cd9..3a8175d4 100644 --- a/application/parser/py2doc.py +++ b/application/parser/py2doc.py @@ -1,10 +1,12 @@ -import os import ast -import tiktoken +import os from pathlib import Path + +import tiktoken from langchain.llms import OpenAI from langchain.prompts import PromptTemplate + def find_files(directory): files_list = [] for root, dirs, files in os.walk(directory): @@ -13,6 +15,7 @@ def find_files(directory): files_list.append(os.path.join(root, file)) return files_list + def extract_functions(file_path): with open(file_path, 'r') as file: source_code = file.read() @@ -25,6 +28,7 @@ def extract_functions(file_path): functions[func_name] = func_def return functions + def extract_classes(file_path): with open(file_path, 'r') as file: source_code = file.read() @@ -40,6 +44,7 @@ def extract_classes(file_path): classes[class_name] = ", ".join(function_names) return classes + def extract_functions_and_classes(directory): files = find_files(directory) functions_dict = {} @@ -53,11 +58,12 @@ def extract_functions_and_classes(directory): classes_dict[file] = classes return functions_dict, classes_dict + def parse_functions(functions_dict, formats, dir): c1 = len(functions_dict) for i, (source, functions) in enumerate(functions_dict.items(), start=1): print(f"Processing file {i}/{c1}") - source_w = source.replace(dir+"/", "").replace("."+formats, ".md") + source_w = source.replace(dir + "/", "").replace("." + formats, ".md") subfolders = "/".join(source_w.split("/")[:-1]) Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) for j, (name, function) in enumerate(functions.items(), start=1): @@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir): response = llm(prompt.format(code=function)) mode = "a" if Path(f"outputs/{source_w}").exists() else "w" with open(f"outputs/{source_w}", mode) as f: - f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") + f.write( + f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}") def parse_classes(classes_dict, formats, dir): c1 = len(classes_dict) for i, (source, classes) in enumerate(classes_dict.items()): - print(f"Processing file {i+1}/{c1}") - source_w = source.replace(dir+"/", "").replace("."+formats, ".md") + print(f"Processing file {i + 1}/{c1}") + source_w = source.replace(dir + "/", "").replace("." 
+ formats, ".md") subfolders = "/".join(source_w.split("/")[:-1]) Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True) for name, function_names in classes.items(): - print(f"Processing Class {i+1}/{c1}") + print(f"Processing Class {i + 1}/{c1}") prompt = PromptTemplate( input_variables=["class_name", "functions_names"], template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ", @@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir): with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f: f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}") + def transform_to_docs(functions_dict, classes_dict, formats, dir): docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()]) docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()]) @@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir): parse_classes(classes_dict, formats, dir) print("All done!") else: - print("The API was not called. No money was spent.") \ No newline at end of file + print("The API was not called. No money was spent.") diff --git a/application/parser/schema/base.py b/application/parser/schema/base.py index 0871f88f..3dafda1a 100644 --- a/application/parser/schema/base.py +++ b/application/parser/schema/base.py @@ -2,7 +2,6 @@ from dataclasses import dataclass from langchain.docstore.document import Document as LCDocument - from parser.schema.schema import BaseDocument diff --git a/application/parser/token_func.py b/application/parser/token_func.py index 95b318b9..e77376f5 100644 --- a/application/parser/token_func.py +++ b/application/parser/token_func.py @@ -1,9 +1,9 @@ import re -import tiktoken - -from typing import List -from parser.schema.base import Document from math import ceil +from typing import List + +import tiktoken +from parser.schema.base import Document def separate_header_and_body(text): @@ -13,6 +13,7 @@ def separate_header_and_body(text): body = text[len(header):] return header, body + def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]: docs = [] current_group = None @@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) if current_group is None: current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding, extra_info=doc.extra_info) - elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens: + elif len(tiktoken.get_encoding("cl100k_base").encode( + current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens: current_group.text += " " + doc.text else: docs.append(current_group) @@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) return docs + def split_documents(documents: List[Document], max_tokens: int) -> List[Document]: docs = [] for doc in documents: @@ -54,17 +57,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document docs.append(new_doc) return docs + def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True): - if token_check == False: + if not token_check: return documents print("Grouping small documents") try: documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens) - except: + except Exception: print("Grouping failed, try 
running without token_check") print("Separating large documents") try: documents = split_documents(documents=documents, max_tokens=max_tokens) - except: + except Exception: print("Grouping failed, try running without token_check") return documents diff --git a/application/worker.py b/application/worker.py index 8498bfa3..5e47c8a9 100644 --- a/application/worker.py +++ b/application/worker.py @@ -1,18 +1,17 @@ -import requests -import nltk import os - -from parser.file.bulk import SimpleDirectoryReader -from parser.schema.base import Document -from parser.open_ai_func import call_openai_api -from parser.token_func import group_split -from urllib.parse import urljoin -from core.settings import settings - - +import shutil import string import zipfile -import shutil +from urllib.parse import urljoin + +import nltk +import requests + +from core.settings import settings +from parser.file.bulk import SimpleDirectoryReader +from parser.open_ai_func import call_openai_api +from parser.schema.base import Document +from parser.token_func import group_split try: nltk.download('punkt', quiet=True) @@ -50,7 +49,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): with open(full_path + '/' + filename, 'wb') as f: f.write(file) - #check if file is .zip and extract it + # check if file is .zip and extract it if filename.endswith('.zip'): with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref: zip_ref.extractall(full_path) @@ -68,7 +67,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user): call_openai_api(docs, full_path, self) self.update_state(state='PROGRESS', meta={'current': 100}) - if sample == True: + if sample: for i in range(min(5, len(raw_docs))): print(raw_docs[i].text) diff --git a/application/wsgi.py b/application/wsgi.py index 05ee157f..a65465ee 100644 --- a/application/wsgi.py +++ b/application/wsgi.py @@ -1,4 +1,4 @@ from app import app if __name__ == "__main__": - app.run(debug=True, port=5001) \ No newline at end of file + app.run(debug=True, port=5001) diff --git a/extensions/chatwoot/app.py b/extensions/chatwoot/app.py index f1830041..8580902b 100644 --- a/extensions/chatwoot/app.py +++ b/extensions/chatwoot/app.py @@ -1,18 +1,20 @@ -import requests -import dotenv import os -import json import pprint +import dotenv +import requests +from flask import Flask, request + dotenv.load_dotenv() docsgpt_url = os.getenv("docsgpt_url") chatwoot_url = os.getenv("chatwoot_url") docsgpt_key = os.getenv("docsgpt_key") chatwoot_token = os.getenv("chatwoot_token") -#account_id = os.getenv("account_id") -#assignee_id = os.getenv("assignee_id") +# account_id = os.getenv("account_id") +# assignee_id = os.getenv("assignee_id") label_stop = "human-requested" + def send_to_bot(sender, message): data = { 'sender': sender, @@ -43,7 +45,6 @@ def send_to_chatwoot(account, conversation, message): return r.json() -from flask import Flask, request app = Flask(__name__) @@ -74,7 +75,7 @@ def docsgpt(): # elif str(assignee) != str(assignee_id): # return "Not the right assignee" - if(message_type == "incoming"): + if (message_type == "incoming"): bot_response = send_to_bot(contact, message) create_message = send_to_chatwoot( account, conversation, bot_response) @@ -83,5 +84,6 @@ def docsgpt(): return create_message + if __name__ == '__main__': - app.run(host='0.0.0.0', port=80) \ No newline at end of file + app.run(host='0.0.0.0', port=80) diff --git a/scripts/code_docs_gen.py b/scripts/code_docs_gen.py index 3b057506..50edf3f4 100644 --- a/scripts/code_docs_gen.py 
+++ b/scripts/code_docs_gen.py @@ -1,17 +1,10 @@ -from pathlib import Path -from langchain.text_splitter import CharacterTextSplitter -import faiss -from langchain.vectorstores import FAISS -from langchain.embeddings import OpenAIEmbeddings -from langchain.llms import OpenAI -from langchain.prompts import PromptTemplate -import pickle -import dotenv -import tiktoken -import sys -from argparse import ArgumentParser import ast import json +from pathlib import Path + +import dotenv +from langchain.llms import OpenAI +from langchain.prompts import PromptTemplate dotenv.load_dotenv() @@ -24,12 +17,6 @@ for p in ps: sources.append(p) -# with open('inputs/client.py', 'r') as f: -# tree = ast.parse(f.read()) - -# print(tree) - - def get_functions_in_class(node): functions = [] functions_code = [] @@ -66,16 +53,6 @@ for code in data: with open('structure_dict.json', 'w') as f: json.dump(structure_dict, f) -# llm = OpenAI(temperature=0) -# prompt = PromptTemplate( -# input_variables=["code"], -# template="Code: {code}, Documentation: ", -# ) -# -# print(prompt.format(code="print('hello world')")) -# print(llm(prompt.format(code="print('hello world')"))) - - if not Path("outputs").exists(): Path("outputs").mkdir() diff --git a/scripts/ingest.py b/scripts/ingest.py index 72f497f3..1aa27565 100644 --- a/scripts/ingest.py +++ b/scripts/ingest.py @@ -1,19 +1,19 @@ import os import sys -import nltk -import dotenv -import typer - from collections import defaultdict from typing import List, Optional +import dotenv +import nltk +import typer + from parser.file.bulk import SimpleDirectoryReader -from parser.schema.base import Document -from parser.open_ai_func import call_openai_api, get_user_permission -from parser.py2doc import transform_to_docs -from parser.py2doc import extract_functions_and_classes as extract_py -from parser.js2doc import extract_functions_and_classes as extract_js from parser.java2doc import extract_functions_and_classes as extract_java +from parser.js2doc import extract_functions_and_classes as extract_js +from parser.open_ai_func import call_openai_api, get_user_permission +from parser.py2doc import extract_functions_and_classes as extract_py +from parser.py2doc import transform_to_docs +from parser.schema.base import Document from parser.token_func import group_split dotenv.load_dotenv() @@ -38,7 +38,8 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."), formats: Optional[List[str]] = typer.Option([".rst", ".md"], help="""List of required extensions (list with .) 
- Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""), + Currently supported: + .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""), exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."), sample: Optional[bool] = typer.Option(False, help="Whether to output sample of the first 5 split documents."), @@ -65,7 +66,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False, # docs = text_splitter.split_documents(raw_docs) # Sample feature - if sample == True: + if sample: for i in range(min(5, len(raw_docs))): print(raw_docs[i].text) diff --git a/scripts/old/ingest_rst.py b/scripts/old/ingest_rst.py index 7912cf75..d086ae75 100644 --- a/scripts/old/ingest_rst.py +++ b/scripts/old/ingest_rst.py @@ -1,38 +1,42 @@ -from pathlib import Path -from langchain.text_splitter import CharacterTextSplitter -import faiss -from langchain.vectorstores import FAISS -from langchain.embeddings import OpenAIEmbeddings import pickle -import dotenv -import tiktoken import sys from argparse import ArgumentParser +from pathlib import Path + +import dotenv +import faiss +import tiktoken +from langchain.embeddings import OpenAIEmbeddings +from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import FAISS + def num_tokens_from_string(string: str, encoding_name: str) -> int: -# Function to convert string to tokens and estimate user cost. + # Function to convert string to tokens and estimate user cost. encoding = tiktoken.get_encoding(encoding_name) num_tokens = len(encoding.encode(string)) - total_price = ((num_tokens/1000) * 0.0004) + total_price = ((num_tokens / 1000) * 0.0004) return num_tokens, total_price + def call_openai_api(): -# Function to create a vector store from the documents and save it to disk. + # Function to create a vector store from the documents and save it to disk. store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas) faiss.write_index(store.index, "docs.index") store.index = None with open("faiss_store.pkl", "wb") as f: pickle.dump(store, f) + def get_user_permission(): -# Function to ask user permission to call the OpenAI api and spend their OpenAI funds. + # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. docs_content = (" ".join(docs)) tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base") # Here we print the number of tokens and the approx user cost with some visually appealing formatting. print(f"Number of Tokens = {format(tokens, ',d')}") print(f"Approx Cost = ${format(total_price, ',.2f')}") - #Here we check for user permission before calling the API. + # Here we check for user permission before calling the API. user_input = input("Price Okay? (Y/N) \n").lower() if user_input == "y": call_openai_api() @@ -41,7 +45,8 @@ def get_user_permission(): else: print("The API was not called. 
No money was spent.") -#Load .env file + +# Load .env file dotenv.load_dotenv() ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.") diff --git a/scripts/old/ingest_rst_sphinx.py b/scripts/old/ingest_rst_sphinx.py index ecc71570..132cb688 100644 --- a/scripts/old/ingest_rst_sphinx.py +++ b/scripts/old/ingest_rst_sphinx.py @@ -1,71 +1,75 @@ import os import pickle -import dotenv -import tiktoken -import sys -import faiss import shutil +import sys +from argparse import ArgumentParser from pathlib import Path -from langchain.vectorstores import FAISS + +import dotenv +import faiss +import tiktoken from langchain.embeddings import OpenAIEmbeddings from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import FAISS from sphinx.cmd.build import main as sphinx_main -from argparse import ArgumentParser + def convert_rst_to_txt(src_dir, dst_dir): - # Check if the source directory exists - if not os.path.exists(src_dir): - raise Exception("Source directory does not exist") - # Walk through the source directory - for root, dirs, files in os.walk(src_dir): - for file in files: - # Check if the file has .rst extension - if file.endswith(".rst"): - # Construct the full path of the file - src_file = os.path.join(root, file.replace(".rst", "")) - # Convert the .rst file to .txt file using sphinx-build - args = f". -b text -D extensions=sphinx.ext.autodoc " \ - f"-D master_doc={src_file} " \ - f"-D source_suffix=.rst " \ - f"-C {dst_dir} " - sphinx_main(args.split()) - elif file.endswith(".md"): - # Rename the .md file to .rst file - src_file = os.path.join(root, file) - dst_file = os.path.join(root, file.replace(".md", ".rst")) - os.rename(src_file, dst_file) - # Convert the .rst file to .txt file using sphinx-build - args = f". -b text -D extensions=sphinx.ext.autodoc " \ - f"-D master_doc={dst_file} " \ - f"-D source_suffix=.rst " \ - f"-C {dst_dir} " - sphinx_main(args.split()) + # Check if the source directory exists + if not os.path.exists(src_dir): + raise Exception("Source directory does not exist") + # Walk through the source directory + for root, dirs, files in os.walk(src_dir): + for file in files: + # Check if the file has .rst extension + if file.endswith(".rst"): + # Construct the full path of the file + src_file = os.path.join(root, file.replace(".rst", "")) + # Convert the .rst file to .txt file using sphinx-build + args = f". -b text -D extensions=sphinx.ext.autodoc " \ + f"-D master_doc={src_file} " \ + f"-D source_suffix=.rst " \ + f"-C {dst_dir} " + sphinx_main(args.split()) + elif file.endswith(".md"): + # Rename the .md file to .rst file + src_file = os.path.join(root, file) + dst_file = os.path.join(root, file.replace(".md", ".rst")) + os.rename(src_file, dst_file) + # Convert the .rst file to .txt file using sphinx-build + args = f". -b text -D extensions=sphinx.ext.autodoc " \ + f"-D master_doc={dst_file} " \ + f"-D source_suffix=.rst " \ + f"-C {dst_dir} " + sphinx_main(args.split()) def num_tokens_from_string(string: str, encoding_name: str) -> int: -# Function to convert string to tokens and estimate user cost. + # Function to convert string to tokens and estimate user cost. encoding = tiktoken.get_encoding(encoding_name) num_tokens = len(encoding.encode(string)) - total_price = ((num_tokens/1000) * 0.0004) + total_price = ((num_tokens / 1000) * 0.0004) return num_tokens, total_price + def call_openai_api(): -# Function to create a vector store from the documents and save it to disk. 
+ # Function to create a vector store from the documents and save it to disk. store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas) faiss.write_index(store.index, "docs.index") store.index = None with open("faiss_store.pkl", "wb") as f: pickle.dump(store, f) + def get_user_permission(): -# Function to ask user permission to call the OpenAI api and spend their OpenAI funds. + # Function to ask user permission to call the OpenAI api and spend their OpenAI funds. # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents. docs_content = (" ".join(docs)) tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base") # Here we print the number of tokens and the approx user cost with some visually appealing formatting. print(f"Number of Tokens = {format(tokens, ',d')}") print(f"Approx Cost = ${format(total_price, ',.2f')}") - #Here we check for user permission before calling the API. + # Here we check for user permission before calling the API. user_input = input("Price Okay? (Y/N) \n").lower() if user_input == "y": call_openai_api() @@ -74,6 +78,7 @@ def get_user_permission(): else: print("The API was not called. No money was spent.") + ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation") ap.add_argument("-i", "--inputs", type=str, @@ -81,17 +86,17 @@ ap.add_argument("-i", "--inputs", help="Directory containing documentation files") args = ap.parse_args() -#Load .env file +# Load .env file dotenv.load_dotenv() -#Directory to vector +# Directory to vector src_dir = args.inputs dst_dir = "tmp" convert_rst_to_txt(src_dir, dst_dir) # Here we load in the data in the format that Notion exports it in. -ps = list(Path("tmp/"+ src_dir).glob("**/*.txt")) +ps = list(Path("tmp/" + src_dir).glob("**/*.txt")) # parse all child directories data = [] diff --git a/scripts/parser/file/base.py b/scripts/parser/file/base.py index c2777a06..2fe9a75d 100644 --- a/scripts/parser/file/base.py +++ b/scripts/parser/file/base.py @@ -3,7 +3,6 @@ from abc import abstractmethod from typing import Any, List from langchain.docstore.document import Document as LCDocument - from parser.schema.base import Document diff --git a/scripts/parser/file/html_parser.py b/scripts/parser/file/html_parser.py index 73ce97d3..523d09ec 100644 --- a/scripts/parser/file/html_parser.py +++ b/scripts/parser/file/html_parser.py @@ -24,12 +24,11 @@ class HTMLParser(BaseParser): Union[str, List[str]]: a string or a List of strings. 
""" try: - import unstructured + from unstructured.partition.html import partition_html + from unstructured.staging.base import convert_to_isd + from unstructured.cleaners.core import clean except ImportError: raise ValueError("unstructured package is required to parse HTML files.") - from unstructured.partition.html import partition_html - from unstructured.staging.base import convert_to_isd - from unstructured.cleaners.core import clean # Using the unstructured library to convert the html to isd format # isd sample : isd = [ @@ -70,7 +69,8 @@ class HTMLParser(BaseParser): Chunks.append([]) Chunks[-1].append(isd_el['text']) - # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable + # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 + # TODO: This value can be a user defined variable for chunk in Chunks: # sum of lenth of all the strings in the chunk sum = 0 diff --git a/scripts/parser/file/markdown_parser.py b/scripts/parser/file/markdown_parser.py index 0b767a63..d8aeb3b0 100644 --- a/scripts/parser/file/markdown_parser.py +++ b/scripts/parser/file/markdown_parser.py @@ -7,8 +7,8 @@ import re from pathlib import Path from typing import Any, Dict, List, Optional, Tuple, Union, cast -from parser.file.base_parser import BaseParser import tiktoken +from parser.file.base_parser import BaseParser class MarkdownParser(BaseParser): diff --git a/scripts/parser/file/rst_parser.py b/scripts/parser/file/rst_parser.py index 4e8fd630..f8feff70 100644 --- a/scripts/parser/file/rst_parser.py +++ b/scripts/parser/file/rst_parser.py @@ -5,10 +5,9 @@ Contains parser for md files. """ import re from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union from parser.file.base_parser import BaseParser -import tiktoken class RstParser(BaseParser): diff --git a/scripts/parser/java2doc.py b/scripts/parser/java2doc.py index 7f97750d..2a8bfa3a 100644 --- a/scripts/parser/java2doc.py +++ b/scripts/parser/java2doc.py @@ -1,4 +1,5 @@ import os + import javalang diff --git a/scripts/parser/js2doc.py b/scripts/parser/js2doc.py index 3c99a0a6..6dc44812 100644 --- a/scripts/parser/js2doc.py +++ b/scripts/parser/js2doc.py @@ -1,6 +1,7 @@ import os -import esprima + import escodegen +import esprima def find_files(directory): @@ -27,7 +28,6 @@ def extract_functions(file_path): func_name = declaration.id.name if declaration.id else '' functions[func_name] = escodegen.generate(declaration.init) elif node.type == 'ClassDeclaration': - class_name = node.id.name for subnode in node.body.body: if subnode.type == 'MethodDefinition': func_name = subnode.key.name diff --git a/scripts/parser/open_ai_func.py b/scripts/parser/open_ai_func.py index ef6ea597..1a95ba93 100644 --- a/scripts/parser/open_ai_func.py +++ b/scripts/parser/open_ai_func.py @@ -1,16 +1,15 @@ import os -import faiss -import pickle + import tiktoken -from langchain.vectorstores import FAISS from langchain.embeddings import OpenAIEmbeddings +from langchain.vectorstores import FAISS +from retry import retry + # from langchain.embeddings import HuggingFaceEmbeddings # from langchain.embeddings import HuggingFaceInstructEmbeddings # from langchain.embeddings import CohereEmbeddings -from retry import retry - def num_tokens_from_string(string: str, encoding_name: str) -> int: # Function to convert string to tokens and estimate user cost. 
diff --git a/scripts/parser/py2doc.py b/scripts/parser/py2doc.py index 1443e46e..3a8175d4 100644 --- a/scripts/parser/py2doc.py +++ b/scripts/parser/py2doc.py @@ -1,7 +1,8 @@ -import os import ast -import tiktoken +import os from pathlib import Path + +import tiktoken from langchain.llms import OpenAI from langchain.prompts import PromptTemplate diff --git a/scripts/parser/schema/base.py b/scripts/parser/schema/base.py index 0871f88f..3dafda1a 100644 --- a/scripts/parser/schema/base.py +++ b/scripts/parser/schema/base.py @@ -2,7 +2,6 @@ from dataclasses import dataclass from langchain.docstore.document import Document as LCDocument - from parser.schema.schema import BaseDocument diff --git a/scripts/parser/token_func.py b/scripts/parser/token_func.py index d5435f6b..e77376f5 100644 --- a/scripts/parser/token_func.py +++ b/scripts/parser/token_func.py @@ -1,9 +1,9 @@ import re -import tiktoken - -from typing import List -from parser.schema.base import Document from math import ceil +from typing import List + +import tiktoken +from parser.schema.base import Document def separate_header_and_body(text): @@ -59,16 +59,16 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True): - if token_check == False: + if not token_check: return documents print("Grouping small documents") try: documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens) - except: + except Exception: print("Grouping failed, try running without token_check") print("Separating large documents") try: documents = split_documents(documents=documents, max_tokens=max_tokens) - except: + except Exception: print("Grouping failed, try running without token_check") return documents
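
The new lint.yml workflow runs Ruff via chartboost/ruff-action, and .ruff.toml only relaxes the line length to 120 characters. A rough sketch of reproducing that check locally from Python (assumes the ruff package is installed; the subprocess call is illustrative and not part of this PR):

# Hedged sketch: run the same lint check the CI job performs.
# Ruff discovers .ruff.toml automatically, so line-length = 120 applies.
import subprocess

result = subprocess.run(["ruff", "check", "."], check=False)
print("lint passed" if result.returncode == 0 else "lint failures found")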
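
Several hunks narrow bare "except:" clauses to "except Exception:" (app.py, token_func.py). A hedged illustration of why the narrower form is preferable, using a helper that mimics the answer post-processing in app.py (the helper name is invented for the example):

# Hedged sketch: except Exception still catches ordinary errors, but it lets
# BaseException subclasses such as KeyboardInterrupt and SystemExit propagate,
# so a running worker can still be interrupted cleanly.
def strip_sources(answer: str) -> str:
    try:
        return answer.split("SOURCES:")[0]
    except Exception:
        return answer

print(strip_sources("The answer text SOURCES: docs/index.md"))  # -> "The answer text "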
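
The /api/feedback hunk replaces the hard-coded {"status": 'ok'} with a lookup in http.client.responses keyed by the upstream response's status code. A minimal self-contained sketch of that lookup (the status codes are illustrative):

import http.client

# http.client.responses maps integer status codes to their standard reason
# phrases, falling back to 'ok' when the code is unknown.
for code in (200, 201, 404, 503):
    print(code, http.client.responses.get(code, "ok"))
# 200 OK, 201 Created, 404 Not Found, 503 Service Unavailable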