From 56ee56736bce55b689ed0c5bf2add7c8ff63be46 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Thu, 26 Oct 2023 17:38:35 -0700 Subject: [PATCH] add template for hyde (#12390) --- templates/hyde/LICENSE | 21 ++++++++++++ templates/hyde/README.md | 9 +++++ templates/hyde/hyde/__init__.py | 0 templates/hyde/hyde/chain.py | 59 ++++++++++++++++++++++++++++++++ templates/hyde/hyde/prompts.py | 19 ++++++++++ templates/hyde/pyproject.toml | 32 +++++++++++++++++ templates/hyde/tests/__init__.py | 0 7 files changed, 140 insertions(+) create mode 100644 templates/hyde/LICENSE create mode 100644 templates/hyde/README.md create mode 100644 templates/hyde/hyde/__init__.py create mode 100644 templates/hyde/hyde/chain.py create mode 100644 templates/hyde/hyde/prompts.py create mode 100644 templates/hyde/pyproject.toml create mode 100644 templates/hyde/tests/__init__.py diff --git a/templates/hyde/LICENSE b/templates/hyde/LICENSE new file mode 100644 index 0000000000..426b650903 --- /dev/null +++ b/templates/hyde/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2023 LangChain, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/templates/hyde/README.md b/templates/hyde/README.md new file mode 100644 index 0000000000..81097a7d7f --- /dev/null +++ b/templates/hyde/README.md @@ -0,0 +1,9 @@ +# HyDE + +Hypothetical Document Embeddings (HyDE) is a method for improving retrieval. +To do this, a hypothetical document is generated for an incoming query. +That document is then embedded, and that embedding is used to look up real documents similar to that hypothetical document. +The idea is that the hypothetical document may lie closer to the relevant real documents in the embedding space than the raw query does. +For a more detailed description, read the full paper [here](https://arxiv.org/abs/2212.10496). + +For this example, we use a simple RAG architecture, although you can easily use this technique in other, more complicated architectures. 
diff --git a/templates/hyde/hyde/__init__.py b/templates/hyde/hyde/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/templates/hyde/hyde/chain.py b/templates/hyde/hyde/chain.py new file mode 100644 index 0000000000..99021257b0 --- /dev/null +++ b/templates/hyde/hyde/chain.py @@ -0,0 +1,59 @@ +from langchain.prompts import ChatPromptTemplate +from langchain.chat_models import ChatOpenAI +from langchain.embeddings import OpenAIEmbeddings +from langchain.schema.output_parser import StrOutputParser +from langchain.schema.runnable import RunnablePassthrough, RunnableParallel +from langchain.vectorstores import Chroma +from hyde.prompts import hyde_prompt + +# Example for document loading (from url), splitting, and creating a vectorstore; it is kept inside the string literal below, so it is not executed + +''' +# Load +from langchain.document_loaders import WebBaseLoader +loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/") +data = loader.load() + +# Split +from langchain.text_splitter import RecursiveCharacterTextSplitter +text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0) +all_splits = text_splitter.split_documents(data) + +# Add to vectorDB +vectorstore = Chroma.from_documents(documents=all_splits, + collection_name="rag-chroma", + embedding=OpenAIEmbeddings(), + ) +retriever = vectorstore.as_retriever() +''' + +# Embed a single document as a test; this is a module-level statement, so it runs when the module is imported +vectorstore = Chroma.from_texts( + ["harrison worked at kensho"], + collection_name="rag-chroma", + embedding=OpenAIEmbeddings() +) +retriever = vectorstore.as_retriever() + +# RAG prompt: asks for an answer based only on the supplied context, followed by the question +template = """Answer the question based only on the following context: +{context} + +Question: {question} +""" +prompt = ChatPromptTemplate.from_template(template) + +# LLM, used both to write the hypothetical document and to produce the final answer +model = ChatOpenAI() + +# RAG chain: build the context and question inputs, fill the RAG prompt, call the model, then apply StrOutputParser +chain = ( + RunnableParallel({ + # HyDE step: wrap the incoming value as {"input": ...}, render hyde_prompt, send it to the model, and pass the parsed output to the retriever + "context": {"input": RunnablePassthrough()} | hyde_prompt | model | 
StrOutputParser() | retriever, + "question": RunnablePassthrough() + }) + | prompt + | model + | StrOutputParser() +) diff --git a/templates/hyde/hyde/prompts.py b/templates/hyde/hyde/prompts.py new file mode 100644 index 0000000000..3ffd478163 --- /dev/null +++ b/templates/hyde/hyde/prompts.py @@ -0,0 +1,19 @@ +from langchain.prompts.prompt import PromptTemplate + +# There are a few different templates to choose from; each one takes a single {input} variable +# These are just different ways to generate hypothetical documents +web_search_template = """Please write a passage to answer the question +Question: {input} +Passage:""" +sci_fact_template = """Please write a scientific paper passage to support/refute the claim +Claim: {input} +Passage:""" +fiqa_template = """Please write a financial article passage to answer the question +Question: {input} +Passage:""" +trec_news_template = """Please write a news passage about the topic. +Topic: {input} +Passage:""" + +# For the sake of this example we will use the web search template; the other templates above are defined but not used in this module +hyde_prompt = PromptTemplate.from_template(web_search_template) diff --git a/templates/hyde/pyproject.toml b/templates/hyde/pyproject.toml new file mode 100644 index 0000000000..a678d8fe1f --- /dev/null +++ b/templates/hyde/pyproject.toml @@ -0,0 +1,32 @@ +[tool.poetry] +name = "hyde" +version = "0.0.1" +description = "" +authors = [] +readme = "README.md" + +[tool.poetry.dependencies] +python = ">=3.8.1,<4.0" +langchain = ">=0.0.313, <0.1" +openai = "^0.28.1" + +[tool.poetry.group.dev.dependencies] +poethepoet = "^0.24.1" +langchain-cli = ">=0.0.4" +fastapi = "^0.104.0" +sse-starlette = "^1.6.5" + +[tool.langserve] +export_module = "hyde.chain" +export_attr = "chain" + +[tool.poe.tasks.start] +cmd="uvicorn langchain_cli.dev_scripts:create_demo_server --reload --port $port --host $host" +args = [ + {name = "port", help = "port to run on", default = "8000"}, + {name = "host", help = "host to run on", default = "127.0.0.1"} +] + +[build-system] +requires = ["poetry-core"] 
+build-backend = "poetry.core.masonry.api" diff --git a/templates/hyde/tests/__init__.py b/templates/hyde/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2