token ingest

Pavel 2023-03-14 13:32:29 +04:00
parent 20a0800aa7
commit b6c02c850a
5 changed files with 80 additions and 78 deletions

View File

@@ -1,20 +1,16 @@
import os
import re
import sys
import nltk
import dotenv
import typer
import ast
import tiktoken
from math import ceil
from collections import defaultdict
from pathlib import Path
from typing import List, Optional, Tuple
from typing import List, Optional
from langchain.text_splitter import RecursiveCharacterTextSplitter
from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.open_ai_func import call_openai_api, get_user_permission
@@ -22,6 +18,7 @@ from parser.py2doc import transform_to_docs
from parser.py2doc import extract_functions_and_classes as extract_py
from parser.js2doc import extract_functions_and_classes as extract_js
from parser.java2doc import extract_functions_and_classes as extract_java
from parser.token_func import group_split
dotenv.load_dotenv()
@@ -32,57 +29,6 @@ nltk.download('punkt', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
def group_documents(documents: List[Document], min_tokens: int = 50, max_tokens: int = 2000) -> List[Document]:
    groups = []
    current_group = None
    for doc in documents:
        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
        if current_group is None:
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
        elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
            current_group.text += " " + doc.text
        else:
            groups.append(current_group)
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
    if current_group is not None:
        groups.append(current_group)
    return groups

def separate_header_and_body(text):
    header_pattern = r"^(.*?\n){3}"
    match = re.match(header_pattern, text)
    header = match.group(0)
    body = text[len(header):]
    return header, body

def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[Document]:
    new_documents = []
    for doc in documents:
        token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
        print(token_length)
        if token_length <= max_tokens:
            new_documents.append(doc)
        else:
            header, body = separate_header_and_body(doc.text)
            num_body_parts = ceil(token_length / max_tokens)
            part_length = ceil(len(body) / num_body_parts)
            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
            for i, body_part in enumerate(body_parts):
                new_doc = Document(text=header + body_part.strip(),
                                   doc_id=f"{doc.doc_id}-{i}",
                                   embedding=doc.embedding,
                                   extra_info=doc.extra_info)
                new_documents.append(new_doc)
    return new_documents
# Splits all files in the specified folder into documents
@app.command()
def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
@@ -111,16 +57,15 @@ def ingest(yes: bool = typer.Option(False, "-y", "--yes", prompt=False,
    raw_docs = SimpleDirectoryReader(input_dir=directory, input_files=file, recursive=recursive,
                                     required_exts=formats, num_files_limit=limit,
                                     exclude_hidden=exclude).load_data()
    #Checking min_tokens and max_tokens
    raw_docs = group_split(documents=raw_docs)
    raw_docs = group_documents(raw_docs)
    raw_docs = split_documents(raw_docs)
    print(raw_docs)
    raw_docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
    docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
    # Here we split the documents, as needed, into smaller chunks.
    # We do this due to the context limits of the LLMs.
    text_splitter = RecursiveCharacterTextSplitter()
    docs = text_splitter.split_documents(raw_docs)
    # text_splitter = RecursiveCharacterTextSplitter()
    # docs = text_splitter.split_documents(raw_docs)
    # Here we check for command line arguments for bot calls.
    # If no argument exists or the yes is not True, then the
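Taken together, this hunk drops the inline group_documents/split_documents pass (and the RecursiveCharacterTextSplitter step, now commented out) in favour of a single call to group_split from parser.token_func before the LangChain conversion. As a minimal sketch of the resulting flow, assuming the project's parser package and the Document interface shown above (the input directory and the omitted reader options here are placeholders, not values from the commit):

# Hypothetical sketch of the new ingest pipeline, not part of the commit.
from parser.file.bulk import SimpleDirectoryReader
from parser.schema.base import Document
from parser.token_func import group_split

# Read raw documents from disk (directory name is a placeholder).
raw_docs = SimpleDirectoryReader(input_dir="inputs", exclude_hidden=True).load_data()

# Token-aware grouping of small documents and splitting of large ones.
raw_docs = group_split(documents=raw_docs)

# Convert to LangChain documents; the embedding step (call_openai_api) follows in the real command.
docs = [Document.to_langchain_format(raw_doc) for raw_doc in raw_docs]
print(len(docs))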

Binary file not shown.

View File

@@ -29,7 +29,6 @@ class RstParser(BaseParser):
        remove_whitespaces_excess: bool = True,
        # Be careful with remove_characters_excess, it might cause data loss
        remove_characters_excess: bool = True,
        # max_tokens: int = 2048,
        **kwargs: Any,
    ) -> None:
        """Init params."""
@@ -41,18 +40,6 @@ class RstParser(BaseParser):
        self._remove_directives = remove_directives
        self._remove_whitespaces_excess = remove_whitespaces_excess
        self._remove_characters_excess = remove_characters_excess
        # self._max_tokens = max_tokens

    # def tups_chunk_append(self, tups: List[Tuple[Optional[str], str]], current_header: Optional[str], current_text: str):
    #     """Append to tups chunk."""
    #     num_tokens = len(tiktoken.get_encoding("cl100k_base").encode(current_text))
    #     if num_tokens > self._max_tokens:
    #         chunks = [current_text[i:i + self._max_tokens] for i in range(0, len(current_text), self._max_tokens)]
    #         for chunk in chunks:
    #             tups.append((current_header, chunk))
    #     else:
    #         tups.append((current_header, current_text))
    #     return tups

    def rst_to_tups(self, rst_text: str) -> List[Tuple[Optional[str], str]]:

View File

@@ -0,0 +1,70 @@
import re
import tiktoken
from typing import List
from parser.schema.base import Document
from math import ceil
def separate_header_and_body(text):
    header_pattern = r"^(.*?\n){3}"
    match = re.match(header_pattern, text)
    header = match.group(0)
    body = text[len(header):]
    return header, body

def group_documents(documents: List[Document], min_tokens: int = 200, max_tokens: int = 2000) -> List[Document]:
    docs = []
    current_group = None
    for doc in documents:
        doc_len = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
        if current_group is None:
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
        elif len(tiktoken.get_encoding("cl100k_base").encode(current_group.text)) + doc_len < max_tokens and doc_len >= min_tokens:
            current_group.text += " " + doc.text
        else:
            docs.append(current_group)
            current_group = Document(text=doc.text, doc_id=doc.doc_id, embedding=doc.embedding,
                                     extra_info=doc.extra_info)
    if current_group is not None:
        docs.append(current_group)
    return docs

def split_documents(documents: List[Document], max_tokens: int = 2000) -> List[Document]:
    docs = []
    for doc in documents:
        token_length = len(tiktoken.get_encoding("cl100k_base").encode(doc.text))
        if token_length <= max_tokens:
            docs.append(doc)
        else:
            header, body = separate_header_and_body(doc.text)
            num_body_parts = ceil(token_length / max_tokens)
            part_length = ceil(len(body) / num_body_parts)
            body_parts = [body[i:i + part_length] for i in range(0, len(body), part_length)]
            for i, body_part in enumerate(body_parts):
                new_doc = Document(text=header + body_part.strip(),
                                   doc_id=f"{doc.doc_id}-{i}",
                                   embedding=doc.embedding,
                                   extra_info=doc.extra_info)
                docs.append(new_doc)
    return docs

def group_split(documents: List[Document], max_tokens: int = 1500, min_tokens: int = 500, token_check: bool = True):
    if token_check == False:
        return documents
    print("Grouping small documents")
    try:
        documents = group_documents(documents=documents, min_tokens=min_tokens, max_tokens=max_tokens)
    except:
        print("Grouping failed, try running without token_check")
    print("Separating large documents")
    try:
        documents = split_documents(documents=documents, max_tokens=max_tokens)
    except:
        print("Grouping failed, try running without token_check")
    return documents
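Since the new file ships without a usage example, here is a small, hedged sketch of how group_split behaves. It relies only on what the module above shows (tiktoken's cl100k_base encoding, the Document fields used in the constructors, and the default thresholds); the sample texts, doc ids, and the None/{} placeholder field values are made up for illustration:

# Hypothetical usage sketch for parser.token_func.group_split, not part of the commit.
import tiktoken
from parser.schema.base import Document
from parser.token_func import group_split

enc = tiktoken.get_encoding("cl100k_base")
docs = [
    Document(text="Short setup note.", doc_id="a", embedding=None, extra_info={}),
    Document(text="Title\nAuthor\nDate\n" + "word " * 4000, doc_id="b", embedding=None, extra_info={}),
]
print([len(enc.encode(d.text)) for d in docs])  # token lengths before processing

# group_documents merges a document into the current group only when that document
# has at least min_tokens tokens and the combined text stays under max_tokens;
# split_documents then cuts any document over max_tokens into roughly equal parts,
# each prefixed with its first three lines (the "header").
out = group_split(documents=docs, max_tokens=1500, min_tokens=500)
print([(d.doc_id, len(enc.encode(d.text))) for d in out])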

View File

@@ -7,7 +7,7 @@ from langchain.embeddings import OpenAIEmbeddings
dotenv.load_dotenv()
embeddings_key = os.getenv("API_KEY")
docsearch = FAISS.load_local('outputs/inputs', OpenAIEmbeddings(openai_api_key=embeddings_key))
docsearch = FAISS.load_local('outputs', OpenAIEmbeddings(openai_api_key=embeddings_key))
d1 = docsearch.similarity_search("Whats new in 1.5.3?")
print(d1)