langchain/templates/intel-rag-xeon/intel_rag_xeon/chain.py

from langchain.callbacks import streaming_stdout
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.vectorstores import VectorStoreRetriever


# Input type for the chain: a custom-root model wrapping a single string.
# It is attached to the chain via `with_types(input_type=Question)`; the
# stated intent is to make the chain's input look better in the docs.
class Question(BaseModel):
    __root__: str


# Init Embeddings
# Sentence-transformers model used as the embedding function of the store.
embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Chroma collection "xeon-rag" persisted under /tmp/xeon_rag_db.
# NOTE(review): nothing in this module adds documents to the collection;
# presumably a separate ingestion step populates it -- confirm.
knowledge_base = Chroma(
    persist_directory="/tmp/xeon_rag_db",
    embedding_function=embedder,
    collection_name="xeon-rag",
)

# Import-time smoke test: run one sample query and echo the top hit.
query = "What was Nike's revenue in 2023?"
docs = knowledge_base.similarity_search(query)
# A search that returns no documents (e.g. the collection has not been
# populated yet) would make `docs[0]` raise IndexError and abort the import
# of this module, so the sample output is printed only when a hit exists.
if docs:
    print(docs[0].page_content)

# Retriever used by the chain: "mmr" search type, returning k=1 document
# with fetch_k=5 passed through as search kwargs.
retriever = VectorStoreRetriever(
    vectorstore=knowledge_base, search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}
)

# Define our prompt
# The template text has two placeholders, {context} and {question}; the chain
# fills them from the retriever output and the raw input respectively.
template = """
Use the following pieces of context from retrieved
dataset to answer the question. Do not make up an answer if there is no
context provided to help answer it.

Context:
---------
{context}

---------
Question: {question}
---------

Answer:
"""


# Prompt object built from the template text above.
prompt = ChatPromptTemplate.from_template(template)


# URL of the text-generation endpoint the model talks to (a service on
# localhost port 8080; presumably the TGI server this template targets --
# confirm against the deployment instructions).
ENDPOINT_URL = "http://localhost:8080"
# NOTE(review): this handler list is not passed to the model below and is
# not referenced anywhere else in the visible code -- confirm whether
# `callbacks=callbacks` was intended.
callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()]
# LLM client for the endpoint, configured with its generation settings and
# with streaming enabled.
model = HuggingFaceEndpoint(
    endpoint_url=ENDPOINT_URL,
    max_new_tokens=512,
    top_k=10,
    top_p=0.95,
    typical_p=0.95,
    temperature=0.01,
    repetition_penalty=1.03,
    streaming=True,
)

# RAG chain.
# The incoming question is fanned out twice: once through the retriever
# (becoming "context") and once unchanged (becoming "question"). The
# resulting mapping fills the prompt, the prompt goes to the model, and the
# model output is parsed into a plain string. The chain's input type is
# declared as `Question`.
chain = (
    RunnableParallel(context=retriever, question=RunnablePassthrough())
    | prompt
    | model
    | StrOutputParser()
).with_types(input_type=Question)
templates: add RAG template for Intel Xeon Scalable Processors (#18424)

Description: This template utilizes Chroma and TGI (Text Generation Inference) to execute RAG on Intel Xeon Scalable Processors. It serves as a demonstration for users, illustrating the deployment of the RAG service on Intel Xeon Scalable Processors and showcasing the resulting performance enhancements.
Issue: None
Dependencies: The template contains the poetry project requirements to run this template. CPU TGI batching is WIP.
Twitter handle: None
---------
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Co-authored-by: Bagatur <22008038+baskaryan@users.noreply.github.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2024-03-29 21:37:32 +00:00
			`from langchain.callbacks import streaming_stdout`
			`from langchain_community.embeddings import HuggingFaceEmbeddings`
			`from langchain_community.llms import HuggingFaceEndpoint`
			`from langchain_community.vectorstores import Chroma`
			`from langchain_core.output_parsers import StrOutputParser`
			`from langchain_core.prompts import ChatPromptTemplate`
			`from langchain_core.pydantic_v1 import BaseModel`
			`from langchain_core.runnables import RunnableParallel, RunnablePassthrough`
			`from langchain_core.vectorstores import VectorStoreRetriever`


			`# Make this look better in the docs.`
			`class Question(BaseModel):`
			`__root__: str`


			`# Init Embeddings`
			`embedder = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")`

			`knowledge_base = Chroma(`
			`persist_directory="/tmp/xeon_rag_db",`
			`embedding_function=embedder,`
			`collection_name="xeon-rag",`
			`)`
			`query = "What was Nike's revenue in 2023?"`
			`docs = knowledge_base.similarity_search(query)`
			`print(docs[0].page_content)`
			`retriever = VectorStoreRetriever(`
			`vectorstore=knowledge_base, search_type="mmr", search_kwargs={"k": 1, "fetch_k": 5}`
			`)`

			`# Define our prompt`
			`template = """`
			`Use the following pieces of context from retrieved`
			`dataset to answer the question. Do not make up an answer if there is no`
			`context provided to help answer it.`

			`Context:`
			`---------`
			`{context}`

			`---------`
			`Question: {question}`
			`---------`

			`Answer:`
			`"""`


			`prompt = ChatPromptTemplate.from_template(template)`


			`ENDPOINT_URL = "http://localhost:8080"`
			`callbacks = [streaming_stdout.StreamingStdOutCallbackHandler()]`
			`model = HuggingFaceEndpoint(`
			`endpoint_url=ENDPOINT_URL,`
			`max_new_tokens=512,`
			`top_k=10,`
			`top_p=0.95,`
			`typical_p=0.95,`
			`temperature=0.01,`
			`repetition_penalty=1.03,`
			`streaming=True,`
			`)`

			`# RAG Chain`
			`chain = (`
			`RunnableParallel({"context": retriever, "question": RunnablePassthrough()})`
			`\| prompt`
			`\| model`
			`\| StrOutputParser()`
			`).with_types(input_type=Question)`