You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/templates/rag-timescale-hybrid-search.../rag_timescale_hybrid_search.../chain.py

113 lines
3.2 KiB
Python

# ruff: noqa: E501
import os
from datetime import timedelta
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.chat_models import ChatOpenAI
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.llms import OpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.vectorstores.timescalevector import TimescaleVector
from langchain_core.output_parsers import StrOutputParser
from langchain_core.pydantic_v1 import BaseModel
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from .load_sample_dataset import load_ts_git_dataset
# to enable debug uncomment the following lines:
# from langchain.globals import set_debug
# set_debug(True)
# from dotenv import find_dotenv, load_dotenv
# _ = load_dotenv(find_dotenv())
if os.environ.get("TIMESCALE_SERVICE_URL", None) is None:
raise Exception("Missing `TIMESCALE_SERVICE_URL` environment variable.")
SERVICE_URL = os.environ["TIMESCALE_SERVICE_URL"]
LOAD_SAMPLE_DATA = os.environ.get("LOAD_SAMPLE_DATA", False)
# DATASET SPECIFIC CODE
# Load the sample dataset. You will have to change this to load your own dataset.
collection_name = "timescale_commits"
partition_interval = timedelta(days=7)
if LOAD_SAMPLE_DATA:
load_ts_git_dataset(
SERVICE_URL,
collection_name=collection_name,
num_records=500,
partition_interval=partition_interval,
)
# This will change depending on the metadata stored in your dataset.
document_content_description = "The git log commit summary containing the commit hash, author, date of commit, change summary and change details"
metadata_field_info = [
AttributeInfo(
name="id",
description="A UUID v1 generated from the date of the commit",
type="uuid",
),
AttributeInfo(
# This is a special attribute represent the timestamp of the uuid.
name="__uuid_timestamp",
description="The timestamp of the commit. Specify in YYYY-MM-DDTHH::MM:SSZ format",
type="datetime.datetime",
),
AttributeInfo(
name="author_name",
description="The name of the author of the commit",
type="string",
),
AttributeInfo(
name="author_email",
description="The email address of the author of the commit",
type="string",
),
]
# END DATASET SPECIFIC CODE
embeddings = OpenAIEmbeddings()
vectorstore = TimescaleVector(
embedding=embeddings,
collection_name=collection_name,
service_url=SERVICE_URL,
time_partition_interval=partition_interval,
)
llm = OpenAI(temperature=0)
retriever = SelfQueryRetriever.from_llm(
llm,
vectorstore,
document_content_description,
metadata_field_info,
enable_limit=True,
verbose=True,
)
template = """Answer the question based only on the following context:
{context}
Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
model = ChatOpenAI(temperature=0, model="gpt-4")
# RAG chain
chain = (
RunnableParallel({"context": retriever, "question": RunnablePassthrough()})
| prompt
| model
| StrOutputParser()
)
class Question(BaseModel):
__root__: str
chain = chain.with_types(input_type=Question)