* Validate Python formatting on every build with Ruff
* fix lint warnings
pull/232/head
Anton Larin 1 year ago committed by GitHub
parent 168648e789
commit 962becb9a5

@@ -23,7 +23,7 @@ jobs:
       with:
         username: ${{ secrets.DOCKER_USERNAME }}
         password: ${{ secrets.DOCKER_PASSWORD }}
     - name: Login to ghcr.io
       uses: docker/login-action@v2
       with:

@@ -0,0 +1,17 @@
+name: Python linting
+on:
+  push:
+    branches:
+      - '*'
+  pull_request:
+    types: [ opened, synchronize ]
+jobs:
+  ruff:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: Lint with Ruff
+        uses: chartboost/ruff-action@v1
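The workflow above runs Ruff through the chartboost/ruff-action on every push and pull request. A minimal sketch of reproducing the same check locally before pushing (an assumption, not part of the repository: it presumes Ruff is installed, e.g. via `pip install ruff`, and the exact CLI subcommand can vary between Ruff versions):

# Hypothetical local pre-push check mirroring the CI lint step.
import subprocess
import sys

result = subprocess.run(["ruff", "check", "."])  # non-zero exit code when lint violations are found
sys.exit(result.returncode)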

@@ -0,0 +1,2 @@
+# Allow lines to be as long as 120 characters.
+line-length = 120

@@ -1,8 +1,9 @@
+import asyncio
 import datetime
+import http.client
 import json
 import os
 import traceback
-import asyncio
 import dotenv
 import requests

@@ -26,10 +27,9 @@ from langchain.prompts.chat import (
 from pymongo import MongoClient
 from werkzeug.utils import secure_filename
+from core.settings import settings
 from error import bad_request
 from worker import ingest_worker
-from core.settings import settings
-import celeryconfig
 # os.environ["LANGCHAIN_HANDLER"] = "langchain"

@@ -177,18 +177,12 @@ def api_answer():
         q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
                                   template_format="jinja2")
     if settings.LLM_NAME == "openai_chat":
-        # llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
-        llm = ChatOpenAI(openai_api_key=api_key)
+        llm = ChatOpenAI(openai_api_key=api_key)  # optional parameter: model_name="gpt-4"
         messages_combine = [
             SystemMessagePromptTemplate.from_template(chat_combine_template),
             HumanMessagePromptTemplate.from_template("{question}")
         ]
         p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
-        messages_reduce = [
-            SystemMessagePromptTemplate.from_template(chat_reduce_template),
-            HumanMessagePromptTemplate.from_template("{question}")
-        ]
-        p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
     elif settings.LLM_NAME == "openai":
         llm = OpenAI(openai_api_key=api_key, temperature=0)
     elif settings.LLM_NAME == "manifest":

@@ -226,7 +220,7 @@ def api_answer():
     result['answer'] = result['answer'].replace("\\n", "\n")
     try:
         result['answer'] = result['answer'].split("SOURCES:")[0]
-    except:
+    except Exception:
         pass
     # mock result
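The `except:` to `except Exception:` change in the hunk above is the standard fix for Ruff's bare-except rule (E722). A tiny illustrative sketch, not taken from the codebase, of why the narrower clause is preferred:

# A bare `except:` also swallows KeyboardInterrupt and SystemExit, so Ctrl-C and
# normal interpreter shutdown can be silently ignored; `except Exception:` still
# catches ordinary errors but lets those control-flow exceptions propagate.
answer = "42 SOURCES: docs.md"
try:
    answer = answer.split("SOURCES:")[0]
except Exception:  # preferred over a bare `except:`
    pass
print(answer)  # "42 "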
@@ -295,7 +289,7 @@ def api_feedback():
             "feedback": feedback
         })
     )
-    return {"status": 'ok'}
+    return {"status": http.client.responses.get(response.status_code, 'ok')}

 @app.route('/api/combine', methods=['GET'])
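`http.client.responses` is the standard-library mapping from integer status codes to their reason phrases, so the feedback endpoint now reports the upstream status instead of a hard-coded 'ok'. A small standalone sketch of the lookup, with the same `.get(..., 'ok')` fallback used above:

import http.client

# Maps e.g. 200 -> "OK", 404 -> "Not Found"; unknown codes fall back to 'ok'.
print(http.client.responses.get(200, 'ok'))  # OK
print(http.client.responses.get(404, 'ok'))  # Not Found
print(http.client.responses.get(599, 'ok'))  # ok (599 has no standard reason phrase)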

@@ -1,7 +1,8 @@
 import os
 broker_url = os.getenv("CELERY_BROKER_URL")
 result_backend = os.getenv("CELERY_RESULT_BACKEND")
 task_serializer = 'json'
 result_serializer = 'json'
 accept_content = ['json']

@@ -1,6 +1,7 @@
-from pydantic import BaseSettings
 from pathlib import Path
+from pydantic import BaseSettings

 class Settings(BaseSettings):
     LLM_NAME: str = "openai_chat"

@@ -1,13 +1,15 @@
 from flask import jsonify
 from werkzeug.http import HTTP_STATUS_CODES

-def response_error(code_status,message=None):
-    payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")}
+def response_error(code_status, message=None):
+    payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
     if message:
         payload['message'] = message
     response = jsonify(payload)
     response.status_code = code_status
     return response

-def bad_request(status_code=400,message=''):
-    return response_error(code_status=status_code,message=message)
+def bad_request(status_code=400, message=''):
+    return response_error(code_status=status_code, message=message)

@@ -3,7 +3,6 @@ from abc import abstractmethod
 from typing import Any, List
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.base import Document

@@ -9,6 +9,7 @@ from typing import Dict, Union
 from parser.file.base_parser import BaseParser

 class HTMLParser(BaseParser):
     """HTML parser."""

@@ -23,38 +24,37 @@ class HTMLParser(BaseParser):
             Union[str, List[str]]: a string or a List of strings.
         """
         try:
-            import unstructured
+            from unstructured.partition.html import partition_html
+            from unstructured.staging.base import convert_to_isd
+            from unstructured.cleaners.core import clean
         except ImportError:
             raise ValueError("unstructured package is required to parse HTML files.")
-        from unstructured.partition.html import partition_html
-        from unstructured.staging.base import convert_to_isd
-        from unstructured.cleaners.core import clean
         # Using the unstructured library to convert the html to isd format
         # isd sample : isd = [
         # {"text": "My Title", "type": "Title"},
         # {"text": "My Narrative", "type": "NarrativeText"}
         # ]
         with open(file, "r", encoding="utf-8") as fp:
             elements = partition_html(file=fp)
             isd = convert_to_isd(elements)
         # Removing non ascii charactwers from isd_el['text']
         for isd_el in isd:
             isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()
         # Removing all the \n characters from isd_el['text'] using regex and replace with single space
         # Removing all the extra spaces from isd_el['text'] using regex and replace with single space
         for isd_el in isd:
-            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
-            isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
+            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
+            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)
         # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
         for isd_el in isd:
-            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
+            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)
         # Creating a list of all the indexes of isd_el['type'] = 'Title'
-        title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
+        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']
         # Creating 'Chunks' - List of lists of strings
         # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'

@@ -64,19 +64,20 @@ class HTMLParser(BaseParser):
         Chunks = [[]]
         final_chunks = list(list())
-        for i,isd_el in enumerate(isd):
+        for i, isd_el in enumerate(isd):
             if i in title_indexes:
                 Chunks.append([])
             Chunks[-1].append(isd_el['text'])
-        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
+        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25
+        # TODO: This value can be an user defined variable
         for chunk in Chunks:
             # sum of lenth of all the strings in the chunk
             sum = 0
             sum += len(str(chunk))
             if sum < 25:
                 Chunks.remove(chunk)
-            else :
+            else:
                 # appending all the approved chunks to final_chunks as a single string
                 final_chunks.append(" ".join([str(item) for item in chunk]))
         return final_chunks

@@ -7,8 +7,8 @@ import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from parser.file.base_parser import BaseParser
 import tiktoken
+from parser.file.base_parser import BaseParser

 class MarkdownParser(BaseParser):

@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
     """

     def __init__(
             self,
             *args: Any,
             remove_hyperlinks: bool = True,
             remove_images: bool = True,
             max_tokens: int = 2048,
             # remove_tables: bool = True,
             **kwargs: Any,
     ) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)

@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
         self._max_tokens = max_tokens
         # self._remove_tables = remove_tables

-    def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
+    def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
+                          current_text: str):
         """Append to tups chunk."""
         num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
         if num_tokens > self._max_tokens:

@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
         else:
             tups.append((current_header, current_text))
         return tups

     def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
         """Convert a markdown file to a dictionary.

@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
         return {}

     def parse_tups(
             self, filepath: Path, errors: str = "ignore"
     ) -> List[Tuple[Optional[str], str]]:
         """Parse file into tuples."""
         with open(filepath, "r") as f:

@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
         return markdown_tups

     def parse_file(
             self, filepath: Path, errors: str = "ignore"
     ) -> Union[str, List[str]]:
         """Parse file into string."""
         tups = self.parse_tups(filepath, errors=errors)

@@ -5,10 +5,10 @@ Contains parser for md files.
 """
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union
 from parser.file.base_parser import BaseParser
-import tiktoken

 class RstParser(BaseParser):
     """reStructuredText parser.

@@ -19,17 +19,17 @@ class RstParser(BaseParser):
     """

     def __init__(
             self,
             *args: Any,
             remove_hyperlinks: bool = True,
             remove_images: bool = True,
             remove_table_excess: bool = True,
             remove_interpreters: bool = True,
             remove_directives: bool = True,
             remove_whitespaces_excess: bool = True,
-            #Be carefull with remove_characters_excess, might cause data loss
+            # Be carefull with remove_characters_excess, might cause data loss
             remove_characters_excess: bool = True,
             **kwargs: Any,
     ) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)
@@ -41,7 +41,6 @@ class RstParser(BaseParser):
         self._remove_whitespaces_excess = remove_whitespaces_excess
         self._remove_characters_excess = remove_characters_excess

     def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
         """Convert a reStructuredText file to a dictionary.

@@ -56,7 +55,8 @@ class RstParser(BaseParser):
         for i, line in enumerate(lines):
             header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
-            if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
+            if header_match and i > 0 and (
+                    len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
                 if current_header is not None:
                     if current_text == "" or None:
                         continue

@@ -72,7 +72,7 @@ class RstParser(BaseParser):
                 rst_tups.append((current_header, current_text))

-        #TODO: Format for rst
+        # TODO: Format for rst
         #
         # if current_header is not None:
         # # pass linting, assert keys are defined

@@ -136,7 +136,7 @@ class RstParser(BaseParser):
         return {}

     def parse_tups(
             self, filepath: Path, errors: str = "ignore"
     ) -> List[Tuple[Optional[str], str]]:
         """Parse file into tuples."""
         with open(filepath, "r") as f:

@@ -159,7 +159,7 @@ class RstParser(BaseParser):
         return rst_tups

     def parse_file(
             self, filepath: Path, errors: str = "ignore"
     ) -> Union[str, List[str]]:
         """Parse file into string."""
         tups = self.parse_tups(filepath, errors=errors)

@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
     """

     def __init__(
             self,
             *args: Any,
             concat_rows: bool = True,
             col_joiner: str = ", ",
             row_joiner: str = "\n",
             pandas_config: dict = {},
             **kwargs: Any
     ) -> None:
         """Init params."""
         super().__init__(*args, **kwargs)

@@ -1,6 +1,8 @@
 import os
 import javalang

 def find_files(directory):
     files_list = []
     for root, dirs, files in os.walk(directory):

@@ -9,6 +11,7 @@ def find_files(directory):
                 files_list.append(os.path.join(root, file))
     return files_list

 def extract_functions(file_path):
     with open(file_path, "r") as file:
         java_code = file.read()

@@ -28,6 +31,7 @@ def extract_functions(file_path):
             methods[method_name] = method_source_code
     return methods

 def extract_classes(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -47,6 +51,7 @@ def extract_classes(file_path):
         classes[class_name] = class_string
     return classes

 def extract_functions_and_classes(directory):
     files = find_files(directory)
     functions_dict = {}

@@ -58,4 +63,4 @@ def extract_functions_and_classes(directory):
         classes = extract_classes(file)
         if classes:
             classes_dict[file] = classes
     return functions_dict, classes_dict

@@ -1,6 +1,7 @@
 import os
-import esprima
 import escodegen
+import esprima

 def find_files(directory):

@@ -11,6 +12,7 @@ def find_files(directory):
                 files_list.append(os.path.join(root, file))
     return files_list

 def extract_functions(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -26,7 +28,6 @@ def extract_functions(file_path):
             func_name = declaration.id.name if declaration.id else '<anonymous>'
             functions[func_name] = escodegen.generate(declaration.init)
         elif node.type == 'ClassDeclaration':
-            class_name = node.id.name
             for subnode in node.body.body:
                 if subnode.type == 'MethodDefinition':
                     func_name = subnode.key.name

@@ -38,6 +39,7 @@ def extract_functions(file_path):
             functions[func_name] = escodegen.generate(declaration.init)
     return functions

 def extract_classes(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -53,6 +55,7 @@ def extract_classes(file_path):
         classes[class_name] = ", ".join(function_names)
     return classes

 def extract_functions_and_classes(directory):
     files = find_files(directory)
     functions_dict = {}

@@ -1,32 +1,32 @@
 import os
-import faiss
-import pickle
 import tiktoken
-from langchain.vectorstores import FAISS
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
-#from langchain.embeddings import HuggingFaceEmbeddings
-#from langchain.embeddings import HuggingFaceInstructEmbeddings
-#from langchain.embeddings import CohereEmbeddings
 from retry import retry
+# from langchain.embeddings import HuggingFaceEmbeddings
+# from langchain.embeddings import HuggingFaceInstructEmbeddings
+# from langchain.embeddings import CohereEmbeddings

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens/1000) * 0.0004)
+    total_price = ((num_tokens / 1000) * 0.0004)
     return num_tokens, total_price

 @retry(tries=10, delay=60)
 def store_add_texts_with_retry(store, i):
     store.add_texts([i.page_content], metadatas=[i.metadata])
-    #store_pine.add_texts([i.page_content], metadatas=[i.metadata])
+    # store_pine.add_texts([i.page_content], metadatas=[i.metadata])

 def call_openai_api(docs, folder_name, task_status):
     # Function to create a vector store from the documents and save it to disk.
     # create output folder if it doesn't exist
     if not os.path.exists(f"{folder_name}"):

@@ -44,7 +44,8 @@ def call_openai_api(docs, folder_name, task_status):
     # hf = HuggingFaceEmbeddings(model_name=model_name)
     # store = FAISS.from_documents(docs_test, hf)
     s1 = len(docs)
-    for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
+    for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
+                  bar_format='{l_bar}{bar}| Time Left: {remaining}'):
         try:
             task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)})
             store_add_texts_with_retry(store, i)

@@ -58,20 +59,20 @@ def call_openai_api(docs, folder_name, task_status):
         c1 += 1
     store.save_local(f"{folder_name}")

 def get_user_permission(docs, folder_name):
     # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
     # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
-    #docs_content = (" ".join(docs))
+    # docs_content = (" ".join(docs))
     docs_content = ""
     for doc in docs:
         docs_content += doc.page_content
     tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     print(f"Number of Tokens = {format(tokens, ',d')}")
     print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    #Here we check for user permission before calling the API.
+    # Here we check for user permission before calling the API.
     user_input = input("Price Okay? (Y/N) \n").lower()
     if user_input == "y":
         call_openai_api(docs, folder_name)
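For context, the hard-coded `0.0004` in `num_tokens_from_string` prices tokens at $0.0004 per 1,000, which appears to correspond to the OpenAI Ada embedding rate at the time; a quick standalone sanity check of that arithmetic:

# Worked example of the cost estimate used above (assumes the same
# $0.0004-per-1K-tokens rate that is hard-coded in num_tokens_from_string).
num_tokens = 250_000
total_price = (num_tokens / 1000) * 0.0004
print(f"{num_tokens:,} tokens -> ${total_price:.2f}")  # 250,000 tokens -> $0.10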

@@ -1,10 +1,12 @@
-import os
 import ast
-import tiktoken
+import os
 from pathlib import Path
+import tiktoken
 from langchain.llms import OpenAI
 from langchain.prompts import PromptTemplate

 def find_files(directory):
     files_list = []
     for root, dirs, files in os.walk(directory):

@@ -13,6 +15,7 @@ def find_files(directory):
                 files_list.append(os.path.join(root, file))
     return files_list

 def extract_functions(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -25,6 +28,7 @@ def extract_functions(file_path):
             functions[func_name] = func_def
     return functions

 def extract_classes(file_path):
     with open(file_path, 'r') as file:
         source_code = file.read()

@@ -40,6 +44,7 @@ def extract_classes(file_path):
         classes[class_name] = ", ".join(function_names)
     return classes

 def extract_functions_and_classes(directory):
     files = find_files(directory)
     functions_dict = {}

@@ -53,11 +58,12 @@ def extract_functions_and_classes(directory):
             classes_dict[file] = classes
     return functions_dict, classes_dict

 def parse_functions(functions_dict, formats, dir):
     c1 = len(functions_dict)
     for i, (source, functions) in enumerate(functions_dict.items(), start=1):
         print(f"Processing file {i}/{c1}")
-        source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
+        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
         subfolders = "/".join(source_w.split("/")[:-1])
         Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
         for j, (name, function) in enumerate(functions.items(), start=1):

@@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir):
             response = llm(prompt.format(code=function))
             mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
             with open(f"outputs/{source_w}", mode) as f:
-                f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
+                f.write(
+                    f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")

 def parse_classes(classes_dict, formats, dir):
     c1 = len(classes_dict)
     for i, (source, classes) in enumerate(classes_dict.items()):
-        print(f"Processing file {i+1}/{c1}")
-        source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
+        print(f"Processing file {i + 1}/{c1}")
+        source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
         subfolders = "/".join(source_w.split("/")[:-1])
         Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
         for name, function_names in classes.items():
-            print(f"Processing Class {i+1}/{c1}")
+            print(f"Processing Class {i + 1}/{c1}")
             prompt = PromptTemplate(
                 input_variables=["class_name", "functions_names"],
                 template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",

@@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir):
             with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
                 f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")

 def transform_to_docs(functions_dict, classes_dict, formats, dir):
     docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
     docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])

@@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir):
         parse_classes(classes_dict, formats, dir)
         print("All done!")
     else:
         print("The API was not called. No money was spent.")

@@ -2,7 +2,6 @@
 from dataclasses import dataclass
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.schema import BaseDocument

@@ -1,9 +1,9 @@
 import re
-import tiktoken
+from math import ceil
 from typing import List
+import tiktoken
 from parser.schema.base import Document
-from math import ceil

 def separate_header_and_body(text):

@@ -13,6 +13,7 @@ def separate_header_and_body(text):
     body = text[len(header):]
     return header, body

 def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
     docs = []
     current_group = None

@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
         if current_group is None:
             current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                      extra_info=doc.extra_info)
-        elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
+        elif len(tiktoken.get_encoding("cl100k_base").encode(
+                current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
             current_group.text += " " + doc.text
         else:
             docs.append(current_group)

@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
     return docs

 def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
     docs = []
     for doc in documents:

@@ -54,17 +57,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
         docs.append(new_doc)
     return docs

 def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
-    if token_check == False:
+    if not token_check:
         return documents
     print("Grouping small documents")
     try:
         documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     print("Separating large documents")
     try:
         documents = split_documents(documents=documents, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     return documents
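The `token_check == False` to `not token_check` rewrite above is Ruff's fix for explicit comparisons against `True`/`False` (E712). A tiny illustrative sketch, separate from the codebase:

# Truthiness testing is the idiomatic equivalent of comparing a plain boolean
# flag to False with `==`.
token_check = False
if not token_check:  # preferred over `if token_check == False:`
    print("skipping token check")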

@@ -1,18 +1,17 @@
-import requests
-import nltk
 import os
+import shutil
+import string
+import zipfile
+from urllib.parse import urljoin
+import nltk
+import requests
+from core.settings import settings
 from parser.file.bulk import SimpleDirectoryReader
-from parser.schema.base import Document
 from parser.open_ai_func import call_openai_api
+from parser.schema.base import Document
 from parser.token_func import group_split
-from urllib.parse import urljoin
-from core.settings import settings
-import string
-import zipfile
-import shutil

 try:
     nltk.download('punkt', quiet=True)

@@ -50,7 +49,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     with open(full_path + '/' + filename, 'wb') as f:
         f.write(file)
-    #check if file is .zip and extract it
+    # check if file is .zip and extract it
     if filename.endswith('.zip'):
         with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
             zip_ref.extractall(full_path)

@@ -68,7 +67,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
     call_openai_api(docs, full_path, self)
     self.update_state(state='PROGRESS', meta={'current': 100})
-    if sample == True:
+    if sample:
         for i in range(min(5, len(raw_docs))):
             print(raw_docs[i].text)

@@ -1,4 +1,4 @@
 from app import app

 if __name__ == "__main__":
     app.run(debug=True, port=5001)

@@ -1,18 +1,20 @@
-import requests
-import dotenv
 import os
-import json
 import pprint
+import dotenv
+import requests
+from flask import Flask, request

 dotenv.load_dotenv()
 docsgpt_url = os.getenv("docsgpt_url")
 chatwoot_url = os.getenv("chatwoot_url")
 docsgpt_key = os.getenv("docsgpt_key")
 chatwoot_token = os.getenv("chatwoot_token")
-#account_id = os.getenv("account_id")
-#assignee_id = os.getenv("assignee_id")
+# account_id = os.getenv("account_id")
+# assignee_id = os.getenv("assignee_id")
 label_stop = "human-requested"

 def send_to_bot(sender, message):
     data = {
         'sender': sender,

@@ -43,7 +45,6 @@ def send_to_chatwoot(account, conversation, message):
     return r.json()

-from flask import Flask, request
 app = Flask(__name__)

@@ -74,7 +75,7 @@ def docsgpt():
     # elif str(assignee) != str(assignee_id):
     # return "Not the right assignee"
-    if(message_type == "incoming"):
+    if (message_type == "incoming"):
         bot_response = send_to_bot(contact, message)
         create_message = send_to_chatwoot(
             account, conversation, bot_response)

@@ -83,5 +84,6 @@ def docsgpt():
     return create_message

 if __name__ == '__main__':
     app.run(host='0.0.0.0', port=80)

@@ -1,17 +1,10 @@
+import ast
+import json
 from pathlib import Path
-from langchain.text_splitter import CharacterTextSplitter
-import faiss
+import dotenv
-from langchain.vectorstores import FAISS
-from langchain.embeddings import OpenAIEmbeddings
 from langchain.llms import OpenAI
 from langchain.prompts import PromptTemplate
-import pickle
-import dotenv
-import tiktoken
-import sys
-from argparse import ArgumentParser
-import ast
-import json

 dotenv.load_dotenv()

@@ -24,12 +17,6 @@ for p in ps:
     sources.append(p)
-# with open('inputs/client.py', 'r') as f:
-# tree = ast.parse(f.read())
-# print(tree)

 def get_functions_in_class(node):
     functions = []
     functions_code = []

@@ -66,16 +53,6 @@ for code in data:
 with open('structure_dict.json', 'w') as f:
     json.dump(structure_dict, f)

-# llm = OpenAI(temperature=0)
-# prompt = PromptTemplate(
-# input_variables=["code"],
-# template="Code: {code}, Documentation: ",
-# )
-#
-# print(prompt.format(code="print('hello world')"))
-# print(llm(prompt.format(code="print('hello world')")))

 if not Path("outputs").exists():
     Path("outputs").mkdir()

@@ -1,19 +1,19 @@
 import os
 import sys
-import nltk
-import dotenv
-import typer
 from collections import defaultdict
 from typing import List, Optional
+import dotenv
+import nltk
+import typer
 from parser.file.bulk import SimpleDirectoryReader
-from parser.schema.base import Document
+from parser.java2doc import extract_functions_and_classes as extract_java
+from parser.js2doc import extract_functions_and_classes as extract_js
 from parser.open_ai_func import call_openai_api, get_user_permission
-from parser.py2doc import transform_to_docs
 from parser.py2doc import extract_functions_and_classes as extract_py
-from parser.js2doc import extract_functions_and_classes as extract_js
-from parser.java2doc import extract_functions_and_classes as extract_java
+from parser.py2doc import transform_to_docs
+from parser.schema.base import Document
 from parser.token_func import group_split

 dotenv.load_dotenv()

@@ -38,7 +38,8 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
            limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
            formats: Optional[List[str]] = typer.Option([".rst", ".md"],
                                                        help="""List of required extensions (list with .)
-                                                       Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
+                                                       Currently supported:
+                                                       .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
            exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
            sample: Optional[bool] = typer.Option(False,
                                                  help="Whether to output sample of the first 5 split documents."),

@@ -65,7 +66,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
     # docs = text_splitter.split_documents(raw_docs)
     # Sample feature
-    if sample == True:
+    if sample:
         for i in range(min(5, len(raw_docs))):
             print(raw_docs[i].text)

@@ -1,38 +1,42 @@
-from pathlib import Path
-from langchain.text_splitter import CharacterTextSplitter
-import faiss
-from langchain.vectorstores import FAISS
-from langchain.embeddings import OpenAIEmbeddings
 import pickle
-import dotenv
-import tiktoken
 import sys
 from argparse import ArgumentParser
+from pathlib import Path
+import dotenv
+import faiss
+import tiktoken
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens/1000) * 0.0004)
+    total_price = ((num_tokens / 1000) * 0.0004)
     return num_tokens, total_price

 def call_openai_api():
     # Function to create a vector store from the documents and save it to disk.
     store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
     faiss.write_index(store.index, "docs.index")
     store.index = None
     with open("faiss_store.pkl", "wb") as f:
         pickle.dump(store, f)

 def get_user_permission():
     # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
     # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
     docs_content = (" ".join(docs))
     tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     print(f"Number of Tokens = {format(tokens, ',d')}")
     print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    #Here we check for user permission before calling the API.
+    # Here we check for user permission before calling the API.
     user_input = input("Price Okay? (Y/N) \n").lower()
     if user_input == "y":
         call_openai_api()

@@ -41,7 +45,8 @@ def get_user_permission():
     else:
         print("The API was not called. No money was spent.")

-#Load .env file
+# Load .env file
 dotenv.load_dotenv()

 ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")

@@ -1,71 +1,75 @@
 import os
 import pickle
-import dotenv
-import tiktoken
-import sys
-import faiss
 import shutil
+import sys
+from argparse import ArgumentParser
 from pathlib import Path
-from langchain.vectorstores import FAISS
+import dotenv
+import faiss
+import tiktoken
 from langchain.embeddings import OpenAIEmbeddings
 from langchain.text_splitter import CharacterTextSplitter
+from langchain.vectorstores import FAISS
 from sphinx.cmd.build import main as sphinx_main
-from argparse import ArgumentParser

 def convert_rst_to_txt(src_dir, dst_dir):
     # Check if the source directory exists
     if not os.path.exists(src_dir):
         raise Exception("Source directory does not exist")
     # Walk through the source directory
     for root, dirs, files in os.walk(src_dir):
         for file in files:
             # Check if the file has .rst extension
             if file.endswith(".rst"):
                 # Construct the full path of the file
                 src_file = os.path.join(root, file.replace(".rst", ""))
                 # Convert the .rst file to .txt file using sphinx-build
                 args = f". -b text -D extensions=sphinx.ext.autodoc " \
                        f"-D master_doc={src_file} " \
                        f"-D source_suffix=.rst " \
                        f"-C {dst_dir} "
                 sphinx_main(args.split())
             elif file.endswith(".md"):
                 # Rename the .md file to .rst file
                 src_file = os.path.join(root, file)
                 dst_file = os.path.join(root, file.replace(".md", ".rst"))
                 os.rename(src_file, dst_file)
                 # Convert the .rst file to .txt file using sphinx-build
                 args = f". -b text -D extensions=sphinx.ext.autodoc " \
                        f"-D master_doc={dst_file} " \
                        f"-D source_suffix=.rst " \
                        f"-C {dst_dir} "
                 sphinx_main(args.split())

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.
     encoding = tiktoken.get_encoding(encoding_name)
     num_tokens = len(encoding.encode(string))
-    total_price = ((num_tokens/1000) * 0.0004)
+    total_price = ((num_tokens / 1000) * 0.0004)
     return num_tokens, total_price

 def call_openai_api():
     # Function to create a vector store from the documents and save it to disk.
     store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
     faiss.write_index(store.index, "docs.index")
     store.index = None
     with open("faiss_store.pkl", "wb") as f:
         pickle.dump(store, f)

 def get_user_permission():
     # Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
     # Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
     docs_content = (" ".join(docs))
     tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
     # Here we print the number of tokens and the approx user cost with some visually appealing formatting.
     print(f"Number of Tokens = {format(tokens, ',d')}")
     print(f"Approx Cost = ${format(total_price, ',.2f')}")
-    #Here we check for user permission before calling the API.
+    # Here we check for user permission before calling the API.
     user_input = input("Price Okay? (Y/N) \n").lower()
     if user_input == "y":
         call_openai_api()

@@ -74,6 +78,7 @@ def get_user_permission():
     else:
         print("The API was not called. No money was spent.")

 ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
 ap.add_argument("-i", "--inputs",
                 type=str,

@@ -81,17 +86,17 @@ ap.add_argument("-i", "--inputs",
                 help="Directory containing documentation files")
 args = ap.parse_args()

-#Load .env file
+# Load .env file
 dotenv.load_dotenv()

-#Directory to vector
+# Directory to vector
 src_dir = args.inputs
 dst_dir = "tmp"
 convert_rst_to_txt(src_dir, dst_dir)

 # Here we load in the data in the format that Notion exports it in.
-ps = list(Path("tmp/"+ src_dir).glob("**/*.txt"))
+ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))

 # parse all child directories
 data = []

@@ -3,7 +3,6 @@ from abc import abstractmethod
 from typing import Any, List
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.base import Document

@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
             Union[str, List[str]]: a string or a List of strings.
         """
         try:
-            import unstructured
+            from unstructured.partition.html import partition_html
+            from unstructured.staging.base import convert_to_isd
+            from unstructured.cleaners.core import clean
         except ImportError:
             raise ValueError("unstructured package is required to parse HTML files.")
-        from unstructured.partition.html import partition_html
-        from unstructured.staging.base import convert_to_isd
-        from unstructured.cleaners.core import clean

         # Using the unstructured library to convert the html to isd format
         # isd sample : isd = [

@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
                 Chunks.append([])
             Chunks[-1].append(isd_el['text'])
-        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
+        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25
+        # TODO: This value can be a user defined variable
         for chunk in Chunks:
             # sum of lenth of all the strings in the chunk
             sum = 0

@@ -7,8 +7,8 @@ import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union, cast
-from parser.file.base_parser import BaseParser
 import tiktoken
+from parser.file.base_parser import BaseParser

 class MarkdownParser(BaseParser):

@@ -5,10 +5,9 @@ Contains parser for md files.
 """
 import re
 from pathlib import Path
-from typing import Any, Dict, List, Optional, Tuple, Union, cast
+from typing import Any, Dict, List, Optional, Tuple, Union
 from parser.file.base_parser import BaseParser
-import tiktoken

 class RstParser(BaseParser):

@@ -1,4 +1,5 @@
 import os
 import javalang

@@ -1,6 +1,7 @@
 import os
-import esprima
 import escodegen
+import esprima

 def find_files(directory):

@@ -27,7 +28,6 @@ def extract_functions(file_path):
             func_name = declaration.id.name if declaration.id else '<anonymous>'
             functions[func_name] = escodegen.generate(declaration.init)
         elif node.type == 'ClassDeclaration':
-            class_name = node.id.name
             for subnode in node.body.body:
                 if subnode.type == 'MethodDefinition':
                     func_name = subnode.key.name

@@ -1,16 +1,15 @@
 import os
-import faiss
-import pickle
 import tiktoken
-from langchain.vectorstores import FAISS
 from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores import FAISS
+from retry import retry
 # from langchain.embeddings import HuggingFaceEmbeddings
 # from langchain.embeddings import HuggingFaceInstructEmbeddings
 # from langchain.embeddings import CohereEmbeddings
-from retry import retry

 def num_tokens_from_string(string: str, encoding_name: str) -> int:
     # Function to convert string to tokens and estimate user cost.

@@ -1,7 +1,8 @@
-import os
 import ast
-import tiktoken
+import os
 from pathlib import Path
+import tiktoken
 from langchain.llms import OpenAI
 from langchain.prompts import PromptTemplate

@@ -2,7 +2,6 @@
 from dataclasses import dataclass
 from langchain.docstore.document import Document as LCDocument
 from parser.schema.schema import BaseDocument

@@ -1,9 +1,9 @@
 import re
-import tiktoken
+from math import ceil
 from typing import List
+import tiktoken
 from parser.schema.base import Document
-from math import ceil

 def separate_header_and_body(text):

@@ -59,16 +59,16 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
 def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
-    if token_check == False:
+    if not token_check:
         return documents
     print("Grouping small documents")
     try:
         documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     print("Separating large documents")
     try:
         documents = split_documents(documents=documents, max_tokens=max_tokens)
-    except:
+    except Exception:
         print("Grouping failed, try running without token_check")
     return documents
