Merge branch 'main' into feature/streaming

pull/251/head
commit 3b8039a580, authored by Alex, committed via GitHub

@@ -24,9 +24,11 @@ from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
    AIMessagePromptTemplate,
)
from pymongo import MongoClient
from werkzeug.utils import secure_filename
from langchain.llms import GPT4All
from core.settings import settings
from error import bad_request
@@ -108,6 +110,7 @@ def run_async_chain(chain, question, chat_history):
    result["answer"] = answer
    return result

def get_vectorstore(data):
    if "active_docs" in data:
        if data["active_docs"].split("/")[0] == "local":
@@ -134,6 +137,7 @@ def get_docsearch(vectorstore, embeddings_key):
        docsearch = FAISS.load_local(vectorstore, CohereEmbeddings(cohere_api_key=embeddings_key))
    return docsearch
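This loads a FAISS index saved on disk, paired with the same embedding class that originally built it. A minimal sketch of the equivalent OpenAI branch (the index path and API key are placeholders):

```python
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings

# The directory must have been written earlier with FAISS.save_local().
docsearch = FAISS.load_local("indexes/default", OpenAIEmbeddings(openai_api_key="sk-..."))
hits = docsearch.similarity_search("How do I configure the worker?", k=3)
```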

@celery.task(bind=True)
def ingest(self, directory, formats, name_job, filename, user):
    resp = ingest_worker(self, directory, formats, name_job, filename, user)
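Since ingest is a bound Celery task, the upload route can enqueue it and poll its state; a hypothetical dispatch (the argument values are illustrative):

```python
# .delay() is Celery's shorthand for apply_async(); "self" is injected by bind=True.
task = ingest.delay("/tmp/uploads/job1", [".pdf", ".md"], "job1", "file.pdf", "local")
print(task.id, task.state)  # PENDING until a worker picks it up, then SUCCESS
```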
@@ -216,17 +220,26 @@ def api_answer():
    # Note: if you used embeddings other than OpenAI's, you need to change them here as well
    docsearch = get_docsearch(vectorstore, embeddings_key)
    c_prompt = PromptTemplate(input_variables=["summaries", "question"], template=template,
                              template_format="jinja2")
    q_prompt = PromptTemplate(input_variables=["context", "question"], template=template_quest,
                              template_format="jinja2")
    if settings.LLM_NAME == "openai_chat":
        llm = ChatOpenAI(openai_api_key=api_key)  # optional parameter: model_name="gpt-4"
        messages_combine = [SystemMessagePromptTemplate.from_template(chat_combine_template)]
        if history:
            tokens_current_history = 0
            tokens_max_history = 1000
            # count tokens in history, most recent turns first
            history.reverse()
            for i in history:
                if "prompt" in i and "response" in i:
                    tokens_batch = llm.get_num_tokens(i["prompt"]) + llm.get_num_tokens(i["response"])
                    if tokens_current_history + tokens_batch < tokens_max_history:
                        tokens_current_history += tokens_batch
                        messages_combine.append(HumanMessagePromptTemplate.from_template(i["prompt"]))
                        messages_combine.append(AIMessagePromptTemplate.from_template(i["response"]))
        messages_combine.append(HumanMessagePromptTemplate.from_template("{question}"))
        p_chat_combine = ChatPromptTemplate.from_messages(messages_combine)
    elif settings.LLM_NAME == "openai":
        llm = OpenAI(openai_api_key=api_key, temperature=0)
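The history block above implements a token-budget window: walk turns from most recent to oldest and keep a prompt/response pair only while the running count stays under 1000 tokens. The same idea as a standalone helper, a sketch (trim_history and its signature are illustrative, not part of the codebase):

```python
def trim_history(llm, history, max_tokens=1000):
    """Keep the most recent prompt/response pairs that fit a token budget."""
    kept = []
    used = 0
    for turn in reversed(history):  # newest first
        if "prompt" not in turn or "response" not in turn:
            continue
        # get_num_tokens is a real method on langchain LLM/chat model classes.
        cost = llm.get_num_tokens(turn["prompt"]) + llm.get_num_tokens(turn["response"])
        # Mirror the loop above: skip turns that exceed the budget, keep scanning.
        if used + cost < max_tokens:
            used += cost
            kept.append(turn)
    return kept
```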
@@ -236,6 +249,8 @@ def api_answer():
        llm = HuggingFaceHub(repo_id="bigscience/bloom", huggingfacehub_api_token=api_key)
    elif settings.LLM_NAME == "cohere":
        llm = Cohere(model="command-xlarge-nightly", cohere_api_key=api_key)
    elif settings.LLM_NAME == "gpt4all":
        llm = GPT4All(model=settings.MODEL_PATH)
    else:
        raise ValueError("unknown LLM model")
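The new gpt4all branch runs entirely on local weights read from MODEL_PATH, so no API key is involved. A quick standalone check, assuming a valid GPT4All weights file at the given path:

```python
from langchain.llms import GPT4All

# The path is illustrative; it must point at downloaded GPT4All weights.
llm = GPT4All(model="./models/gpt4all-model.bin")
print(llm("In one sentence, what does a vector store do?"))
```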
@@ -251,9 +266,22 @@ def api_answer():
        # result = chain({"question": question, "chat_history": chat_history})
        # generate asynchronously with the async generate method
        result = run_async_chain(chain, question, chat_history)
    elif settings.LLM_NAME == "gpt4all":
        # p_chat_combine is only built in the openai_chat branch above; rebuild it
        # here so this path does not fail with a NameError.
        p_chat_combine = ChatPromptTemplate.from_messages(
            [SystemMessagePromptTemplate.from_template(chat_combine_template),
             HumanMessagePromptTemplate.from_template("{question}")])
        question_generator = LLMChain(llm=llm, prompt=CONDENSE_QUESTION_PROMPT)
        doc_chain = load_qa_chain(llm, chain_type="map_reduce", combine_prompt=p_chat_combine)
        chain = ConversationalRetrievalChain(
            retriever=docsearch.as_retriever(k=2),
            question_generator=question_generator,
            combine_docs_chain=doc_chain,
        )
        chat_history = []
        # result = chain({"question": question, "chat_history": chat_history})
        # generate asynchronously with the async generate method
        result = run_async_chain(chain, question, chat_history)
    else:
        qa_chain = load_qa_chain(llm=llm, chain_type="map_reduce",
                                 combine_prompt=chat_combine_template, question_prompt=q_prompt)
        chain = VectorDBQA(combine_documents_chain=qa_chain, vectorstore=docsearch, k=3)
        result = chain({"query": question})

@@ -9,6 +9,7 @@ class Settings(BaseSettings):
    CELERY_BROKER_URL: str = "redis://localhost:6379/0"
    CELERY_RESULT_BACKEND: str = "redis://localhost:6379/1"
    MONGO_URI: str = "mongodb://localhost:27017/docsgpt"
    MODEL_PATH: str = "./models/gpt4all-model.bin"
    API_URL: str = "http://localhost:5001"  # backend URL for the Celery worker
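Because Settings extends pydantic's BaseSettings, every field can be overridden by an environment variable of the same name, so MODEL_PATH does not need to be hard-coded; a minimal sketch of that behavior:

```python
from pydantic import BaseSettings

class Settings(BaseSettings):
    MODEL_PATH: str = "./models/gpt4all-model.bin"

# With the variable exported (e.g. MODEL_PATH=/opt/models/ggml-gpt4all-j.bin),
# instantiation picks it up and the default above is replaced.
settings = Settings()
print(settings.MODEL_PATH)
```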

@@ -1,5 +1,6 @@
You are DocsGPT, a friendly and helpful AI assistant by Arc53 that provides help with documents. You give thorough answers with code examples if possible.
Use the following pieces of context to help answer the user's question. If it is not relevant to the question, provide friendly responses.
You have access to chat history, and can use it to help answer the question.
When using code examples, use the following format:
```(language)
(code)

@@ -26,10 +26,12 @@ ecdsa==0.18.0
entrypoints==0.4
faiss-cpu==1.7.3
filelock==3.9.0
Flask==2.2.3
Flask-Cors==3.0.10
frozenlist==1.3.3
geojson==2.5.0
greenlet==2.0.2
gpt4all==0.1.7
hub==3.0.1
huggingface-hub==0.12.1
humbug==0.2.8
@@ -39,7 +41,8 @@ Jinja2==3.1.2
jmespath==1.0.1
joblib==1.2.0
kombu==5.2.4
langchain==0.0.179
loguru==0.6.0
lxml==4.9.2
MarkupSafe==2.1.2
marshmallow==3.19.0

@@ -9,7 +9,7 @@ services:
    ports:
      - "5173:5173"
    depends_on:
      - backend
  backend:
    build: ./application

@@ -1,8 +1,5 @@
"""Simple reader that reads files of different formats from a directory."""
from parser.file.base import BaseReader
from parser.file.base_parser import BaseParser
from parser.file.docs_parser import DocxParser, PDFParser
@@ -12,6 +9,8 @@ from parser.file.markdown_parser import MarkdownParser
from parser.file.rst_parser import RstParser
from parser.file.tabular_parser import PandasCSVParser
from parser.schema.base import Document
from pathlib import Path
from typing import Callable, Dict, List, Optional, Union

DEFAULT_FILE_EXTRACTOR: Dict[str, BaseParser] = {
    ".pdf": PDFParser(),
@@ -151,10 +150,15 @@ class SimpleDirectoryReader(BaseReader):
                    data = f.read()
            if isinstance(data, List):
                data_list.extend(data)
                if self.file_metadata is not None:
                    # one metadata entry per extracted chunk keeps the lists aligned
                    for _ in range(len(data)):
                        metadata_list.append(self.file_metadata(str(input_file)))
            else:
                data_list.append(str(data))
                if self.file_metadata is not None:
                    metadata_list.append(self.file_metadata(str(input_file)))

        if concatenate:
            return [Document("\n".join(data_list))]
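The change above appends one metadata entry per extracted chunk, so data_list and metadata_list stay the same length. A hypothetical usage, with the constructor and method names assumed from the attributes visible here:

```python
# file_metadata receives the file path and returns a metadata dict per document.
reader = SimpleDirectoryReader(
    input_dir="./docs",                          # assumed parameter name
    file_metadata=lambda path: {"source": path},
)
docs = reader.load_data()                        # assumed method name
```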
