Mirror of https://github.com/arc53/DocsGPT, synced 2024-11-17 21:26:26 +00:00
Commit 059ffe09ea

.github/workflows/ci.yml (vendored): 2 changed lines
@@ -23,7 +23,7 @@ jobs:
        with:
          username: ${{ secrets.DOCKER_USERNAME }}
          password: ${{ secrets.DOCKER_PASSWORD }}

      - name: Login to ghcr.io
        uses: docker/login-action@v2
        with:
.github/workflows/lint.yml (vendored, new file): 17 lines

@@ -0,0 +1,17 @@
name: Python linting

on:
  push:
    branches:
      - '*'
  pull_request:
    types: [ opened, synchronize ]

jobs:
  ruff:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3

      - name: Lint with Ruff
        uses: chartboost/ruff-action@v1
.ruff.toml (new file): 2 lines

@@ -0,0 +1,2 @@
# Allow lines to be as long as 120 characters.
line-length = 120
@@ -1,8 +1,9 @@
import asyncio
import datetime
import http.client
import json
import os
import traceback
import asyncio

import dotenv
import requests

@@ -26,10 +27,9 @@ from langchain.prompts.chat import (
from pymongo import MongoClient
from werkzeug.utils import secure_filename

from core.settings import settings
from error import bad_request
from worker import ingest_worker
from core.settings import settings
import celeryconfig

# os.environ["LANGCHAIN_HANDLER"] = "langchain"

@@ -177,18 +177,12 @@ def api_answer():
q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
template_format="jinja2")
if settings.LLM_NAME == "openai_chat":
# llm = ChatOpenAI(openai_api_key=api_key, model_name="gpt-4")
llm = ChatOpenAI(openai_api_key=api_key)
llm = ChatOpenAI(openai_api_key=api_key)  # optional parameter: model_name="gpt-4"
messages_combine = [
SystemMessagePromptTemplate.from_template(chat_combine_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
messages_reduce = [
SystemMessagePromptTemplate.from_template(chat_reduce_template),
HumanMessagePromptTemplate.from_template("{question}")
]
p_chat_reduce = ChatPromptTemplate.from_messages(messages_reduce)
elif settings.LLM_NAME == "openai":
llm = OpenAI(openai_api_key=api_key, temperature=0)
elif settings.LLM_NAME == "manifest":

@@ -226,7 +220,7 @@ def api_answer():
result['answer'] = result['answer'].replace("\\n", "\n")
try:
result['answer'] = result['answer'].split("SOURCES:")[0]
except:
except Exception:
pass

# mock result

@@ -295,7 +289,7 @@ def api_feedback():
"feedback": feedback
})
)
return {"status": 'ok'}
return {"status": http.client.responses.get(response.status_code, 'ok')}

@app.route('/api/combine', methods=['GET'])
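Note on the api_answer() change above: the bare "except:" around the SOURCES split was narrowed to "except Exception:". A bare except also swallows KeyboardInterrupt and SystemExit, which makes the process hard to stop cleanly; "except Exception:" leaves those alone. A minimal illustration (not code from this commit):

    def strip_sources(answer: str) -> str:
        try:
            return answer.split("SOURCES:")[0]
        except Exception:  # narrower than a bare except; Ctrl-C still propagates
            return answer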
@@ -1,7 +1,8 @@
import os

broker_url = os.getenv("CELERY_BROKER_URL")
result_backend = os.getenv("CELERY_RESULT_BACKEND")

task_serializer = 'json'
result_serializer = 'json'
accept_content = ['json']
accept_content = ['json']
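For context, a module-level Celery settings file like this one is normally loaded with config_from_object; a minimal sketch (the app name and the way the app object is created are assumptions, not taken from this commit):

    from celery import Celery

    celery = Celery(__name__)
    # Picks up broker_url, result_backend and the serializer settings from celeryconfig.py
    celery.config_from_object('celeryconfig')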
@@ -1,6 +1,7 @@
from pydantic import BaseSettings
from pathlib import Path

from pydantic import BaseSettings


class Settings(BaseSettings):
LLM_NAME: str = "openai_chat"
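As a reminder of how a pydantic BaseSettings subclass like Settings resolves values, here is a minimal sketch (assuming pydantic v1, where BaseSettings lives in the top-level package; the environment value is invented for illustration):

    import os
    from pydantic import BaseSettings

    class Settings(BaseSettings):
        LLM_NAME: str = "openai_chat"  # default used when no environment variable is set

    os.environ["LLM_NAME"] = "openai"  # an environment variable overrides the default
    print(Settings().LLM_NAME)         # prints "openai"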
@@ -1,13 +1,15 @@
from flask import jsonify
from werkzeug.http import HTTP_STATUS_CODES

def response_error(code_status,message=None):
payload = {'error':HTTP_STATUS_CODES.get(code_status,"something went wrong")}

def response_error(code_status, message=None):
payload = {'error': HTTP_STATUS_CODES.get(code_status, "something went wrong")}
if message:
payload['message'] = message
response = jsonify(payload)
response.status_code = code_status
return response

def bad_request(status_code=400,message=''):
return response_error(code_status=status_code,message=message)

def bad_request(status_code=400, message=''):
return response_error(code_status=status_code, message=message)
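A minimal usage sketch for the bad_request helper above inside a Flask view (the route and field name are illustrative assumptions, not code from this commit):

    from flask import Flask, request
    from error import bad_request

    app = Flask(__name__)

    @app.route('/api/answer', methods=['POST'])
    def answer():
        data = request.get_json()
        if not data or 'question' not in data:
            # responds with {"error": "Bad Request", "message": "question is required"} and status 400
            return bad_request(400, 'question is required')
        return {"status": 'ok'}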
@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List

from langchain.docstore.document import Document as LCDocument

from parser.schema.base import Document

@@ -9,6 +9,7 @@ from typing import Dict, Union

from parser.file.base_parser import BaseParser


class HTMLParser(BaseParser):
"""HTML parser."""

@@ -23,38 +24,37 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean

# Using the unstructured library to convert the html to isd format
# isd sample : isd = [
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
# {"text": "My Title", "type": "Title"},
# {"text": "My Narrative", "type": "NarrativeText"}
# ]
with open(file, "r", encoding="utf-8") as fp:
elements = partition_html(file=fp)
isd = convert_to_isd(elements)
isd = convert_to_isd(elements)

# Removing non ascii charactwers from isd_el['text']
# Removing non ascii charactwers from isd_el['text']
for isd_el in isd:
isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()

# Removing all the \n characters from isd_el['text'] using regex and replace with single space
# Removing all the extra spaces from isd_el['text'] using regex and replace with single space
for isd_el in isd:
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}"," ", isd_el['text'], flags=re.MULTILINE|re.DOTALL)
isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)

# more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
for isd_el in isd:
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True )
clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)

# Creating a list of all the indexes of isd_el['type'] = 'Title'
title_indexes = [i for i,isd_el in enumerate(isd) if isd_el['type'] == 'Title']
title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']

# Creating 'Chunks' - List of lists of strings
# each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'

@@ -64,19 +64,20 @@ class HTMLParser(BaseParser):
Chunks = [[]]
final_chunks = list(list())

for i,isd_el in enumerate(isd):
for i, isd_el in enumerate(isd):
if i in title_indexes:
Chunks.append([])
Chunks[-1].append(isd_el['text'])

# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
# TODO: This value can be an user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0
sum += len(str(chunk))
if sum < 25:
Chunks.remove(chunk)
else :
else:
# appending all the approved chunks to final_chunks as a single string
final_chunks.append(" ".join([str(item) for item in chunk]))
return final_chunks
@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser


class MarkdownParser(BaseParser):

@@ -20,13 +20,13 @@ class MarkdownParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
max_tokens: int = 2048,
# remove_tables: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

@@ -35,8 +35,8 @@ class MarkdownParser(BaseParser):
self._max_tokens = max_tokens
# self._remove_tables = remove_tables


def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str],
current_text: str):
"""Append to tups chunk."""
num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
if num_tokens > self._max_tokens:

@@ -46,6 +46,7 @@ class MarkdownParser(BaseParser):
else:
tups.append((current_header, current_text))
return tups

def markdown_to_tups(self, markdown_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a markdown file to a dictionary.

@@ -115,7 +116,7 @@ class MarkdownParser(BaseParser):
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:

@@ -130,7 +131,7 @@ class MarkdownParser(BaseParser):
return markdown_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
@@ -5,10 +5,10 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union

from parser.file.base_parser import BaseParser
import tiktoken


class RstParser(BaseParser):
"""reStructuredText parser.

@@ -19,17 +19,17 @@ class RstParser(BaseParser):
"""

def __init__(
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
#Be carefull with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
self,
*args: Any,
remove_hyperlinks: bool = True,
remove_images: bool = True,
remove_table_excess: bool = True,
remove_interpreters: bool = True,
remove_directives: bool = True,
remove_whitespaces_excess: bool = True,
# Be carefull with remove_characters_excess, might cause data loss
remove_characters_excess: bool = True,
**kwargs: Any,
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)

@@ -41,7 +41,6 @@ class RstParser(BaseParser):
self._remove_whitespaces_excess = remove_whitespaces_excess
self._remove_characters_excess = remove_characters_excess


def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:
"""Convert a reStructuredText file to a dictionary.

@@ -56,7 +55,8 @@ class RstParser(BaseParser):

for i, line in enumerate(lines):
header_match = re.match(r"^[^\S\n]*[-=]+[^\S\n]*$", line)
if header_match and i > 0 and (len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if header_match and i > 0 and (
len(lines[i - 1].strip()) == len(header_match.group().strip()) or lines[i - 2] == lines[i - 2]):
if current_header is not None:
if current_text == "" or None:
continue

@@ -72,7 +72,7 @@ class RstParser(BaseParser):

rst_tups.append((current_header, current_text))

#TODO: Format for rst
# TODO: Format for rst
#
# if current_header is not None:
# # pass linting, assert keys are defined

@@ -136,7 +136,7 @@ class RstParser(BaseParser):
return {}

def parse_tups(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> List[Tuple[Optional[str], str]]:
"""Parse file into tuples."""
with open(filepath, "r") as f:

@@ -159,7 +159,7 @@ class RstParser(BaseParser):
return rst_tups

def parse_file(
self, filepath: Path, errors: str = "ignore"
self, filepath: Path, errors: str = "ignore"
) -> Union[str, List[str]]:
"""Parse file into string."""
tups = self.parse_tups(filepath, errors=errors)
@@ -77,13 +77,13 @@ class PandasCSVParser(BaseParser):
"""

def __init__(
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
self,
*args: Any,
concat_rows: bool = True,
col_joiner: str = ", ",
row_joiner: str = "\n",
pandas_config: dict = {},
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
@@ -1,6 +1,8 @@
import os

import javalang


def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):

@@ -9,6 +11,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list


def extract_functions(file_path):
with open(file_path, "r") as file:
java_code = file.read()

@@ -28,6 +31,7 @@ def extract_functions(file_path):
methods[method_name] = method_source_code
return methods


def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -47,6 +51,7 @@ def extract_classes(file_path):
classes[class_name] = class_string
return classes


def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

@@ -58,4 +63,4 @@ def extract_functions_and_classes(directory):
classes = extract_classes(file)
if classes:
classes_dict[file] = classes
return functions_dict, classes_dict
return functions_dict, classes_dict
@@ -1,6 +1,7 @@
import os
import esprima

import escodegen
import esprima


def find_files(directory):

@@ -11,6 +12,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list


def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -26,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name

@@ -38,6 +39,7 @@ def extract_functions(file_path):
functions[func_name] = escodegen.generate(declaration.init)
return functions


def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -53,6 +55,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes


def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}
@@ -1,32 +1,32 @@
import os
import faiss
import pickle

import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

#from langchain.embeddings import HuggingFaceEmbeddings
#from langchain.embeddings import HuggingFaceInstructEmbeddings
#from langchain.embeddings import CohereEmbeddings

from langchain.vectorstores import FAISS
from retry import retry


# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price


@retry(tries=10, delay=60)
def store_add_texts_with_retry(store, i):
store.add_texts([i.page_content], metadatas=[i.metadata])
#store_pine.add_texts([i.page_content], metadatas=[i.metadata])
# store_pine.add_texts([i.page_content], metadatas=[i.metadata])


def call_openai_api(docs, folder_name, task_status):
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.

# create output folder if it doesn't exist
if not os.path.exists(f"{folder_name}"):

@@ -44,7 +44,8 @@ def call_openai_api(docs, folder_name, task_status):
# hf = HuggingFaceEmbeddings(model_name=model_name)
# store = FAISS.from_documents(docs_test, hf)
s1 = len(docs)
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs), bar_format='{l_bar}{bar}| Time Left: {remaining}'):
for i in tqdm(docs, desc="Embedding 🦖", unit="docs", total=len(docs),
bar_format='{l_bar}{bar}| Time Left: {remaining}'):
try:
task_status.update_state(state='PROGRESS', meta={'current': int((c1 / s1) * 100)})
store_add_texts_with_retry(store, i)

@@ -58,20 +59,20 @@ def call_openai_api(docs, folder_name, task_status):
c1 += 1
store.save_local(f"{folder_name}")


def get_user_permission(docs, folder_name):
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
#docs_content = (" ".join(docs))
# docs_content = (" ".join(docs))
docs_content = ""
for doc in docs:
docs_content += doc.page_content

tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api(docs, folder_name)
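A quick usage sketch for num_tokens_from_string as defined above (the sample text is invented; the $0.0004 per 1,000 tokens figure is simply the constant hard-coded in the function, not a statement about current pricing):

    import tiktoken

    def num_tokens_from_string(string, encoding_name):
        # Convert a string to tokens and estimate the embedding cost.
        encoding = tiktoken.get_encoding(encoding_name)
        num_tokens = len(encoding.encode(string))
        total_price = (num_tokens / 1000) * 0.0004
        return num_tokens, total_price

    tokens, price = num_tokens_from_string("DocsGPT ingests project documentation.", "cl100k_base")
    print(f"{tokens} tokens, approx ${price:.6f}")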
@@ -1,10 +1,12 @@
import os
import ast
import tiktoken
import os
from pathlib import Path

import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate


def find_files(directory):
files_list = []
for root, dirs, files in os.walk(directory):

@@ -13,6 +15,7 @@ def find_files(directory):
files_list.append(os.path.join(root, file))
return files_list


def extract_functions(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -25,6 +28,7 @@ def extract_functions(file_path):
functions[func_name] = func_def
return functions


def extract_classes(file_path):
with open(file_path, 'r') as file:
source_code = file.read()

@@ -40,6 +44,7 @@ def extract_classes(file_path):
classes[class_name] = ", ".join(function_names)
return classes


def extract_functions_and_classes(directory):
files = find_files(directory)
functions_dict = {}

@@ -53,11 +58,12 @@ def extract_functions_and_classes(directory):
classes_dict[file] = classes
return functions_dict, classes_dict


def parse_functions(functions_dict, formats, dir):
c1 = len(functions_dict)
for i, (source, functions) in enumerate(functions_dict.items(), start=1):
print(f"Processing file {i}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for j, (name, function) in enumerate(functions.items(), start=1):

@@ -70,18 +76,19 @@ def parse_functions(functions_dict, formats, dir):
response = llm(prompt.format(code=function))
mode = "a" if Path(f"outputs/{source_w}").exists() else "w"
with open(f"outputs/{source_w}", mode) as f:
f.write(f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")
f.write(
f"\n\n# Function name: {name} \n\nFunction: \n```\n{function}\n```, \nDocumentation: \n{response}")


def parse_classes(classes_dict, formats, dir):
c1 = len(classes_dict)
for i, (source, classes) in enumerate(classes_dict.items()):
print(f"Processing file {i+1}/{c1}")
source_w = source.replace(dir+"/", "").replace("."+formats, ".md")
print(f"Processing file {i + 1}/{c1}")
source_w = source.replace(dir + "/", "").replace("." + formats, ".md")
subfolders = "/".join(source_w.split("/")[:-1])
Path(f"outputs/{subfolders}").mkdir(parents=True, exist_ok=True)
for name, function_names in classes.items():
print(f"Processing Class {i+1}/{c1}")
print(f"Processing Class {i + 1}/{c1}")
prompt = PromptTemplate(
input_variables=["class_name", "functions_names"],
template="Class name: {class_name} \nFunctions: {functions_names}, \nDocumentation: ",

@@ -92,6 +99,7 @@ def parse_classes(classes_dict, formats, dir):
with open(f"outputs/{source_w}", "a" if Path(f"outputs/{source_w}").exists() else "w") as f:
f.write(f"\n\n# Class name: {name} \n\nFunctions: \n{function_names}, \nDocumentation: \n{response}")


def transform_to_docs(functions_dict, classes_dict, formats, dir):
docs_content = ''.join([str(key) + str(value) for key, value in functions_dict.items()])
docs_content += ''.join([str(key) + str(value) for key, value in classes_dict.items()])

@@ -110,4 +118,4 @@ def transform_to_docs(functions_dict, classes_dict, formats, dir):
parse_classes(classes_dict, formats, dir)
print("All done!")
else:
print("The API was not called. No money was spent.")
print("The API was not called. No money was spent.")
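The PromptTemplate pattern used in parse_functions and parse_classes can be exercised without spending API credits; this sketch mirrors the commented-out example that also appears elsewhere in this commit:

    from langchain.prompts import PromptTemplate

    prompt = PromptTemplate(
        input_variables=["code"],
        template="Code: {code}, Documentation: ",
    )
    # prompt.format only does string interpolation; no model call happens here
    print(prompt.format(code="print('hello world')"))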
@@ -2,7 +2,6 @@
from dataclasses import dataclass

from langchain.docstore.document import Document as LCDocument

from parser.schema.schema import BaseDocument
@@ -1,9 +1,9 @@
import re
import tiktoken

from typing import List
from parser.schema.base import Document
from math import ceil
from typing import List

import tiktoken
from parser.schema.base import Document


def separate_header_and_body(text):

@@ -13,6 +13,7 @@ def separate_header_and_body(text):
body = text[len(header):]
return header, body


def group_documents(documents: List[Document], min_tokens: int, max_tokens: int) -> List[Document]:
docs = []
current_group = None

@@ -23,7 +24,8 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)
if current_group is None:
current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
extra_info=doc.extra_info)
elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
elif len(tiktoken.get_encoding("cl100k_base").encode(
current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
current_group.text += " " + doc.text
else:
docs.append(current_group)

@@ -35,6 +37,7 @@ def group_documents(documents: List[Document], min_tokens: int, max_tokens: int)

return docs


def split_documents(documents: List[Document], max_tokens: int) -> List[Document]:
docs = []
for doc in documents:

@@ -54,17 +57,18 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document
docs.append(new_doc)
return docs


def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
return documents
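A small usage sketch for group_split above (a sketch only: the Document keyword arguments mirror those used in group_documents, and the extra_info placeholder value is an assumption):

    from parser.schema.base import Document
    from parser.token_func import group_split

    docs = [
        Document(text="Short intro.", doc_id="1", embedding=None, extra_info={}),
        Document(text="A much longer section of documentation ...", doc_id="2", embedding=None, extra_info={}),
    ]
    # Small documents are merged and oversized ones split before embedding
    docs = group_split(documents=docs, max_tokens=2000, min_tokens=150, token_check=True)
    print([len(d.text) for d in docs])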
@@ -1,18 +1,17 @@
import requests
import nltk
import os

from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api
from parser.token_func import group_split
from urllib.parse import urljoin
from core.settings import settings


import shutil
import string
import zipfile
import shutil
from urllib.parse import urljoin

import nltk
import requests

from core.settings import settings
from parser.file.bulk import SimpleDirectoryReader
from parser.open_ai_func import call_openai_api
from parser.schema.base import Document
from parser.token_func import group_split

try:
nltk.download('punkt', quiet=True)

@@ -50,7 +49,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
with open(full_path + '/' + filename, 'wb') as f:
f.write(file)

#check if file is .zip and extract it
# check if file is .zip and extract it
if filename.endswith('.zip'):
with zipfile.ZipFile(full_path + '/' + filename, 'r') as zip_ref:
zip_ref.extractall(full_path)

@@ -68,7 +67,7 @@ def ingest_worker(self, directory, formats, name_job, filename, user):
call_openai_api(docs, full_path, self)
self.update_state(state='PROGRESS', meta={'current': 100})

if sample == True:
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
@@ -1,4 +1,4 @@
from app import app

if __name__ == "__main__":
app.run(debug=True, port=5001)
app.run(debug=True, port=5001)
@@ -1,18 +1,20 @@
import requests
import dotenv
import os
import json
import pprint

import dotenv
import requests
from flask import Flask, request

dotenv.load_dotenv()
docsgpt_url = os.getenv("docsgpt_url")
chatwoot_url = os.getenv("chatwoot_url")
docsgpt_key = os.getenv("docsgpt_key")
chatwoot_token = os.getenv("chatwoot_token")
#account_id = os.getenv("account_id")
#assignee_id = os.getenv("assignee_id")
# account_id = os.getenv("account_id")
# assignee_id = os.getenv("assignee_id")
label_stop = "human-requested"


def send_to_bot(sender, message):
data = {
'sender': sender,

@@ -43,7 +45,6 @@ def send_to_chatwoot(account, conversation, message):
return r.json()


from flask import Flask, request
app = Flask(__name__)


@@ -74,7 +75,7 @@ def docsgpt():
# elif str(assignee) != str(assignee_id):
# return "Not the right assignee"

if(message_type == "incoming"):
if (message_type == "incoming"):
bot_response = send_to_bot(contact, message)
create_message = send_to_chatwoot(
account, conversation, bot_response)

@@ -83,5 +84,6 @@ def docsgpt():

return create_message


if __name__ == '__main__':
app.run(host='0.0.0.0', port=80)
app.run(host='0.0.0.0', port=80)
@@ -1,17 +1,10 @@
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
import ast
import json
from pathlib import Path

import dotenv
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

dotenv.load_dotenv()

@@ -24,12 +17,6 @@ for p in ps:
sources.append(p)


# with open('inputs/client.py', 'r') as f:
# tree = ast.parse(f.read())

# print(tree)


def get_functions_in_class(node):
functions = []
functions_code = []

@@ -66,16 +53,6 @@ for code in data:
with open('structure_dict.json', 'w') as f:
json.dump(structure_dict, f)

# llm = OpenAI(temperature=0)
# prompt = PromptTemplate(
# input_variables=["code"],
# template="Code: {code}, Documentation: ",
# )
#
# print(prompt.format(code="print('hello world')"))
# print(llm(prompt.format(code="print('hello world')")))


if not Path("outputs").exists():
Path("outputs").mkdir()
@@ -1,19 +1,19 @@
import os
import sys
import nltk
import dotenv
import typer

from collections import defaultdict
from typing import List, Optional

import dotenv
import nltk
import typer

from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import transform_to_docs
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.open_ai_func import call_openai_api, get_user_permission
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.py2doc import transform_to_docs
from parser.schema.base import Document
from parser.token_func import group_split

dotenv.load_dotenv()

@@ -38,7 +38,8 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
limit: Optional[int] = typer.Option(None, help="Maximum number of files to read."),
formats: Optional[List[str]] = typer.Option([".rst", ".md"],
help="""List of required extensions (list with .)
Currently supported: .rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
Currently supported:
.rst, .md, .pdf, .docx, .csv, .epub, .html, .mdx"""),
exclude: Optional[bool] = typer.Option(True, help="Whether to exclude hidden files (dotfiles)."),
sample: Optional[bool] = typer.Option(False,
help="Whether to output sample of the first 5 split documents."),

@@ -65,7 +66,7 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
# docs = text_splitter.split_documents(raw_docs)

# Sample feature
if sample == True:
if sample:
for i in range(min(5, len(raw_docs))):
print(raw_docs[i].text)
@@ -1,38 +1,42 @@
from pathlib import Path
from langchain.text_splitter import CharacterTextSplitter
import faiss
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
import pickle
import dotenv
import tiktoken
import sys
from argparse import ArgumentParser
from pathlib import Path

import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price


def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)


def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()

@@ -41,7 +45,8 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")

#Load .env file

# Load .env file
dotenv.load_dotenv()

ap = ArgumentParser("Script for training DocsGPT on .rst documentation files.")
@@ -1,71 +1,75 @@
import os
import pickle
import dotenv
import tiktoken
import sys
import faiss
import shutil
import sys
from argparse import ArgumentParser
from pathlib import Path
from langchain.vectorstores import FAISS

import dotenv
import faiss
import tiktoken
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import FAISS
from sphinx.cmd.build import main as sphinx_main
from argparse import ArgumentParser


def convert_rst_to_txt(src_dir, dst_dir):
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
# Check if the source directory exists
if not os.path.exists(src_dir):
raise Exception("Source directory does not exist")
# Walk through the source directory
for root, dirs, files in os.walk(src_dir):
for file in files:
# Check if the file has .rst extension
if file.endswith(".rst"):
# Construct the full path of the file
src_file = os.path.join(root, file.replace(".rst", ""))
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={src_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())
elif file.endswith(".md"):
# Rename the .md file to .rst file
src_file = os.path.join(root, file)
dst_file = os.path.join(root, file.replace(".md", ".rst"))
os.rename(src_file, dst_file)
# Convert the .rst file to .txt file using sphinx-build
args = f". -b text -D extensions=sphinx.ext.autodoc " \
f"-D master_doc={dst_file} " \
f"-D source_suffix=.rst " \
f"-C {dst_dir} "
sphinx_main(args.split())


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.
# Function to convert string to tokens and estimate user cost.
encoding = tiktoken.get_encoding(encoding_name)
num_tokens = len(encoding.encode(string))
total_price = ((num_tokens/1000) * 0.0004)
total_price = ((num_tokens / 1000) * 0.0004)
return num_tokens, total_price


def call_openai_api():
# Function to create a vector store from the documents and save it to disk.
# Function to create a vector store from the documents and save it to disk.
store = FAISS.from_texts(docs, OpenAIEmbeddings(), metadatas=metadatas)
faiss.write_index(store.index, "docs.index")
store.index = None
with open("faiss_store.pkl", "wb") as f:
pickle.dump(store, f)


def get_user_permission():
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Function to ask user permission to call the OpenAI api and spend their OpenAI funds.
# Here we convert the docs list to a string and calculate the number of OpenAI tokens the string represents.
docs_content = (" ".join(docs))
tokens, total_price = num_tokens_from_string(string=docs_content, encoding_name="cl100k_base")
# Here we print the number of tokens and the approx user cost with some visually appealing formatting.
print(f"Number of Tokens = {format(tokens, ',d')}")
print(f"Approx Cost = ${format(total_price, ',.2f')}")
#Here we check for user permission before calling the API.
# Here we check for user permission before calling the API.
user_input = input("Price Okay? (Y/N) \n").lower()
if user_input == "y":
call_openai_api()

@@ -74,6 +78,7 @@ def get_user_permission():
else:
print("The API was not called. No money was spent.")


ap = ArgumentParser("Script for training DocsGPT on Sphinx documentation")
ap.add_argument("-i", "--inputs",
type=str,

@@ -81,17 +86,17 @@ ap.add_argument("-i", "--inputs",
help="Directory containing documentation files")
args = ap.parse_args()

#Load .env file
# Load .env file
dotenv.load_dotenv()

#Directory to vector
# Directory to vector
src_dir = args.inputs
dst_dir = "tmp"

convert_rst_to_txt(src_dir, dst_dir)

# Here we load in the data in the format that Notion exports it in.
ps = list(Path("tmp/"+ src_dir).glob("**/*.txt"))
ps = list(Path("tmp/" + src_dir).glob("**/*.txt"))

# parse all child directories
data = []
@@ -3,7 +3,6 @@ from abc import abstractmethod
from typing import Any, List

from langchain.docstore.document import Document as LCDocument

from parser.schema.base import Document

@@ -24,12 +24,11 @@ class HTMLParser(BaseParser):
Union[str, List[str]]: a string or a List of strings.
"""
try:
import unstructured
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean
except ImportError:
raise ValueError("unstructured package is required to parse HTML files.")
from unstructured.partition.html import partition_html
from unstructured.staging.base import convert_to_isd
from unstructured.cleaners.core import clean

# Using the unstructured library to convert the html to isd format
# isd sample : isd = [

@@ -70,7 +69,8 @@ class HTMLParser(BaseParser):
Chunks.append([])
Chunks[-1].append(isd_el['text'])

# Removing all the chunks with sum of lenth of all the strings in the chunk < 25 #TODO: This value can be an user defined variable
# Removing all the chunks with sum of lenth of all the strings in the chunk < 25
# TODO: This value can be a user defined variable
for chunk in Chunks:
# sum of lenth of all the strings in the chunk
sum = 0

@@ -7,8 +7,8 @@ import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast

from parser.file.base_parser import BaseParser
import tiktoken
from parser.file.base_parser import BaseParser


class MarkdownParser(BaseParser):

@@ -5,10 +5,9 @@ Contains parser for md files.
"""
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from typing import Any, Dict, List, Optional, Tuple, Union

from parser.file.base_parser import BaseParser
import tiktoken


class RstParser(BaseParser):

@@ -1,4 +1,5 @@
import os

import javalang

@@ -1,6 +1,7 @@
import os
import esprima

import escodegen
import esprima


def find_files(directory):

@@ -27,7 +28,6 @@ def extract_functions(file_path):
func_name = declaration.id.name if declaration.id else '<anonymous>'
functions[func_name] = escodegen.generate(declaration.init)
elif node.type == 'ClassDeclaration':
class_name = node.id.name
for subnode in node.body.body:
if subnode.type == 'MethodDefinition':
func_name = subnode.key.name

@@ -1,16 +1,15 @@
import os
import faiss
import pickle

import tiktoken
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from retry import retry


# from langchain.embeddings import HuggingFaceEmbeddings
# from langchain.embeddings import HuggingFaceInstructEmbeddings
# from langchain.embeddings import CohereEmbeddings

from retry import retry


def num_tokens_from_string(string: str, encoding_name: str) -> int:
# Function to convert string to tokens and estimate user cost.

@@ -1,7 +1,8 @@
import os
import ast
import tiktoken
import os
from pathlib import Path

import tiktoken
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate

@@ -2,7 +2,6 @@
from dataclasses import dataclass

from langchain.docstore.document import Document as LCDocument

from parser.schema.schema import BaseDocument

@@ -1,9 +1,9 @@
import re
import tiktoken

from typing import List
from parser.schema.base import Document
from math import ceil
from typing import List

import tiktoken
from parser.schema.base import Document


def separate_header_and_body(text):

@@ -59,16 +59,16 @@ def split_documents(documents: List[Document], max_tokens: int) -> List[Document


def group_split(documents: List[Document], max_tokens: int = 2000, min_tokens: int = 150, token_check: bool = True):
if token_check == False:
if not token_check:
return documents
print("Grouping small documents")
try:
documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
print("Separating large documents")
try:
documents = split_documents(documents=documents, max_tokens=max_tokens)
except:
except Exception:
print("Grouping failed, try running without token_check")
return documents