TEMPLATES: Add multi-index templates (#13490)

One that routes and one that fuses --------- Co-authored-by: Erick Friis <erick@langchain.dev>
6 months ago · b4312aac5c
parent 35e04f204b
commit b4312aac5c
16 changed files with 4230 additions and 0 deletions
--- a/templates/rag-multi-index-fusion/.gitignore
+++ b/templates/rag-multi-index-fusion/.gitignore
@ -0,0 +1 @@
+__pycache__
--- a/templates/rag-multi-index-fusion/LICENSE
+++ b/templates/rag-multi-index-fusion/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 LangChain, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/templates/rag-multi-index-fusion/README.md
+++ b/templates/rag-multi-index-fusion/README.md
@ -0,0 +1,73 @@
+# RAG with Multiple Indexes (Fusion)
+
+A QA application that queries multiple domain-specific retrievers and selects the most relevant documents from across all retrieved results.
+
+## Environment Setup
+
+This application queries PubMed, ArXiv, Wikipedia, and [Kay AI](https://www.kay.ai) (for SEC filings).
+
+You will need to create a free Kay AI account and [get your API key here](https://www.kay.ai).
+Then set the environment variable:
+
+```bash
+export KAY_API_KEY="<YOUR_API_KEY>"
+```
+
+## Usage
+
+To use this package, you should first have the LangChain CLI installed:
+
+```shell
+pip install -U langchain-cli
+```
+
+To create a new LangChain project and install this as the only package, you can do:
+
+```shell
+langchain app new my-app --package rag-multi-index-fusion
+```
+
+If you want to add this to an existing project, you can just run:
+
+```shell
+langchain app add rag-multi-index-fusion
+```
+
+And add the following code to your `server.py` file:
+```python
+from rag_multi_index_fusion import chain as rag_multi_index_fusion_chain
+
+add_routes(app, rag_multi_index_fusion_chain, path="/rag-multi-index-fusion")
+```
+
+(Optional) Let's now configure LangSmith. 
+LangSmith will help us trace, monitor and debug LangChain applications. 
+LangSmith is currently in private beta; you can sign up [here](https://smith.langchain.com/). 
+If you don't have access, you can skip this section.
+
+
+```shell
+export LANGCHAIN_TRACING_V2=true
+export LANGCHAIN_API_KEY=<your-api-key>
+export LANGCHAIN_PROJECT=<your-project>  # if not specified, defaults to "default"
+```
+
+If you are inside this directory, then you can spin up a LangServe instance directly by:
+
+```shell
+langchain serve
+```
+
+This will start the FastAPI app with a server running locally at 
+[http://localhost:8000](http://localhost:8000)
+
+We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
+We can access the playground at [http://127.0.0.1:8000/rag-multi-index-fusion/playground](http://127.0.0.1:8000/rag-multi-index-fusion/playground)  
+
+We can access the template from code with:
+
+```python
+from langserve.client import RemoteRunnable
+
+runnable = RemoteRunnable("http://localhost:8000/rag-multi-index-fusion")
+```
--- a/templates/rag-multi-index-fusion/poetry.lock
+++ b/templates/rag-multi-index-fusion/poetry.lock
--- a/templates/rag-multi-index-fusion/pyproject.toml
+++ b/templates/rag-multi-index-fusion/pyproject.toml
@ -0,0 +1,29 @@
+[tool.poetry]
+name = "rag-multi-index-fusion"
+version = "0.0.1"
+description = ""
+authors = []
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+langchain = ">=0.0.313, <0.1"
+openai = "<2"
+xmltodict = "^0.13.0"
+kay = "^0.1.2"
+wikipedia = "^1.4.0"
+arxiv = "^2.0.0"
+tiktoken = "^0.5.1"
+
+[tool.poetry.group.dev.dependencies]
+langchain-cli = ">=0.0.15"
+fastapi = "^0.104.0"
+sse-starlette = "^1.6.5"
+
+[tool.langserve]
+export_module = "rag_multi_index_fusion"
+export_attr = "chain"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
--- a/templates/rag-multi-index-fusion/rag_multi_index_fusion/__init__.py
+++ b/templates/rag-multi-index-fusion/rag_multi_index_fusion/__init__.py
@ -0,0 +1,3 @@
+from rag_multi_index_fusion.chain import chain
+
+__all__ = ["chain"]
--- a/templates/rag-multi-index-fusion/rag_multi_index_fusion/chain.py
+++ b/templates/rag-multi-index-fusion/rag_multi_index_fusion/chain.py
@ -0,0 +1,102 @@
+import numpy as np
+from langchain.chat_models import ChatOpenAI
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.prompts import ChatPromptTemplate
+from langchain.pydantic_v1 import BaseModel
+from langchain.retrievers import (
+    ArxivRetriever,
+    KayAiRetriever,
+    PubMedRetriever,
+    WikipediaRetriever,
+)
+from langchain.schema import StrOutputParser
+from langchain.schema.runnable import (
+    RunnableLambda,
+    RunnableParallel,
+    RunnablePassthrough,
+)
+from langchain.utils.math import cosine_similarity
+
+# Domain-specific retrievers; each one is given a run name via with_config.
+pubmed = PubMedRetriever(top_k_results=5).with_config(run_name="pubmed")
+arxiv = ArxivRetriever(top_k_results=5).with_config(run_name="arxiv")
+# Kay AI retriever restricted to the "company" dataset and 10-K filings.
+sec = KayAiRetriever.create(
+    dataset_id="company", data_types=["10-K"], num_contexts=5
+).with_config(run_name="sec_filings")
+wiki = WikipediaRetriever(top_k_results=5, doc_content_chars_max=2000).with_config(
+    run_name="wiki"
+)
+
+# Embedding model that fuse_retrieved_docs uses for the question and documents.
+embeddings = OpenAIEmbeddings()
+
+
+def fuse_retrieved_docs(input):
+    results_map = input["sources"]
+    query = input["question"]
+    embedded_query = embeddings.embed_query(query)
+    names, docs = zip(
+        *((name, doc) for name, docs in results_map.items() for doc in docs)
+    )
+    embedded_docs = embeddings.embed_documents([doc.page_content for doc in docs])
+    similarity = cosine_similarity(
+        [embedded_query],
+        embedded_docs,
+    )
+    most_similar = np.flip(np.argsort(similarity[0]))[:5]
+    return [
+        (
+            names[i],
+            docs[i],
+        )
+        for i in most_similar
+    ]
+
+
+# Resource-type label -> retriever.
+# NOTE(review): not referenced anywhere else in this module as shown; it looks
+# carried over from the router template — confirm whether it is still needed.
+retriever_map = {
+    "medical paper": pubmed,
+    "scientific paper": arxiv,
+    "public company finances report": sec,
+    "general": wiki,
+}
+
+
+def format_named_docs(named_docs):
+    return "\n\n".join(
+        f"Source: {source}\n\n{doc.page_content}" for source, doc in named_docs
+    )
+
+
+# System prompt; {sources} is filled with the formatted retrieved documents.
+# NOTE(review): {{topic}} is brace-escaped, so presumably the model receives a
+# literal "{topic}" placeholder rather than a substituted value — confirm.
+system = """Answer the user question. Use the following sources to help \
+answer the question. If you don't know the answer say "I'm not sure, I couldn't \
+find information on {{topic}}."
+
+Sources:
+
+{sources}"""
+prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])
+
+# Run all four retrievers on the question; the dict keys are the source names
+# that fuse_retrieved_docs pairs with each document.
+retrieve_all = RunnableParallel(
+    {"ArXiv": arxiv, "Wikipedia": wiki, "PubMed": pubmed, "SEC 10-K Forms": sec}
+).with_config(run_name="retrieve_all")
+
+
+# Input type declared for the chain: the whole input is one string.
+class Question(BaseModel):
+    __root__: str
+
+
+# QA pipeline: retrieve from every source, keep the documents most similar to
+# the question, format them into the prompt, and let the chat model answer.
+chain = (
+    (
+        # Pair the raw question with the per-source retrieval results.
+        RunnableParallel(
+            {"question": RunnablePassthrough(), "sources": retrieve_all}
+        ).with_config(run_name="add_sources")
+        # Overwrite "sources" with the fused, formatted text the prompt expects.
+        | RunnablePassthrough.assign(
+            sources=(
+                RunnableLambda(fuse_retrieved_docs) | format_named_docs
+            ).with_config(run_name="fuse_and_format")
+        ).with_config(run_name="update_sources")
+        | prompt
+        | ChatOpenAI(model="gpt-3.5-turbo-1106")
+        | StrOutputParser()
+    )
+    .with_config(run_name="QA with fused results")
+    .with_types(input_type=Question)
+)
--- a/templates/rag-multi-index-fusion/tests/__init__.py
+++ b/templates/rag-multi-index-fusion/tests/__init__.py
--- a/templates/rag-multi-index-router/.gitignore
+++ b/templates/rag-multi-index-router/.gitignore
@ -0,0 +1 @@
+__pycache__
--- a/templates/rag-multi-index-router/LICENSE
+++ b/templates/rag-multi-index-router/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 LangChain, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/templates/rag-multi-index-router/README.md
+++ b/templates/rag-multi-index-router/README.md
@ -0,0 +1,73 @@
+# RAG with Multiple Indexes (Routing)
+
+A QA application that routes between different domain-specific retrievers given a user question.
+
+## Environment Setup
+
+This application queries PubMed, ArXiv, Wikipedia, and [Kay AI](https://www.kay.ai) (for SEC filings).
+
+You will need to create a free Kay AI account and [get your API key here](https://www.kay.ai). 
+Then set the environment variable:
+
+```bash
+export KAY_API_KEY="<YOUR_API_KEY>"
+```
+
+## Usage
+
+To use this package, you should first have the LangChain CLI installed:
+
+```shell
+pip install -U langchain-cli
+```
+
+To create a new LangChain project and install this as the only package, you can do:
+
+```shell
+langchain app new my-app --package rag-multi-index-router
+```
+
+If you want to add this to an existing project, you can just run:
+
+```shell
+langchain app add rag-multi-index-router
+```
+
+And add the following code to your `server.py` file:
+```python
+from rag_multi_index_router import chain as rag_multi_index_router_chain
+
+add_routes(app, rag_multi_index_router_chain, path="/rag-multi-index-router")
+```
+
+(Optional) Let's now configure LangSmith. 
+LangSmith will help us trace, monitor and debug LangChain applications. 
+LangSmith is currently in private beta; you can sign up [here](https://smith.langchain.com/). 
+If you don't have access, you can skip this section.
+
+
+```shell
+export LANGCHAIN_TRACING_V2=true
+export LANGCHAIN_API_KEY=<your-api-key>
+export LANGCHAIN_PROJECT=<your-project>  # if not specified, defaults to "default"
+```
+
+If you are inside this directory, then you can spin up a LangServe instance directly by:
+
+```shell
+langchain serve
+```
+
+This will start the FastAPI app with a server running locally at 
+[http://localhost:8000](http://localhost:8000)
+
+We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
+We can access the playground at [http://127.0.0.1:8000/rag-multi-index-router/playground](http://127.0.0.1:8000/rag-multi-index-router/playground)  
+
+We can access the template from code with:
+
+```python
+from langserve.client import RemoteRunnable
+
+runnable = RemoteRunnable("http://localhost:8000/rag-multi-index-router")
+```
--- a/templates/rag-multi-index-router/poetry.lock
+++ b/templates/rag-multi-index-router/poetry.lock
--- a/templates/rag-multi-index-router/pyproject.toml
+++ b/templates/rag-multi-index-router/pyproject.toml
@ -0,0 +1,29 @@
+[tool.poetry]
+name = "rag-multi-index-router"
+version = "0.0.1"
+description = ""
+authors = []
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+langchain = ">=0.0.313, <0.1"
+openai = "<2"
+xmltodict = "^0.13.0"
+kay = "^0.1.2"
+wikipedia = "^1.4.0"
+arxiv = "^2.0.0"
+tiktoken = "^0.5.1"
+
+[tool.poetry.group.dev.dependencies]
+langchain-cli = ">=0.0.15"
+fastapi = "^0.104.0"
+sse-starlette = "^1.6.5"
+
+[tool.langserve]
+export_module = "rag_multi_index_router"
+export_attr = "chain"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
--- a/templates/rag-multi-index-router/rag_multi_index_router/__init__.py
+++ b/templates/rag-multi-index-router/rag_multi_index_router/__init__.py
@ -0,0 +1,3 @@
+from rag_multi_index_router.chain import chain
+
+__all__ = ["chain"]
--- a/templates/rag-multi-index-router/rag_multi_index_router/chain.py
+++ b/templates/rag-multi-index-router/rag_multi_index_router/chain.py
@ -0,0 +1,96 @@
+from operator import itemgetter
+from typing import Literal
+
+from langchain.chat_models import ChatOpenAI
+from langchain.output_parsers.openai_functions import PydanticAttrOutputFunctionsParser
+from langchain.prompts import ChatPromptTemplate
+from langchain.pydantic_v1 import BaseModel, Field
+from langchain.retrievers import (
+    ArxivRetriever,
+    KayAiRetriever,
+    PubMedRetriever,
+    WikipediaRetriever,
+)
+from langchain.schema import StrOutputParser
+from langchain.schema.runnable import (
+    RouterRunnable,
+    RunnableParallel,
+    RunnablePassthrough,
+)
+from langchain.utils.openai_functions import convert_pydantic_to_openai_function
+
+# Domain-specific retrievers; each one is given a run name via with_config.
+pubmed = PubMedRetriever(top_k_results=5).with_config(run_name="pubmed")
+arxiv = ArxivRetriever(top_k_results=5).with_config(run_name="arxiv")
+# Kay AI retriever restricted to the "company" dataset and 10-K filings.
+sec = KayAiRetriever.create(
+    dataset_id="company", data_types=["10-K"], num_contexts=5
+).with_config(run_name="sec_filings")
+wiki = WikipediaRetriever(top_k_results=5, doc_content_chars_max=2000).with_config(
+    run_name="wiki"
+)
+
+# Chat model shared by the classifier and the final answering step.
+llm = ChatOpenAI(model="gpt-3.5-turbo-1106")
+
+
+# Schema converted to an OpenAI function definition for the classifier below.
+# The Literal options must stay in sync with the keys of retriever_map, because
+# the extracted question_resource value is used as the routing key.
+class Search(BaseModel):
+    """Search for relevant documents by question topic."""
+
+    question_resource: Literal[
+        "medical paper", "scientific paper", "public company finances report", "general"
+    ] = Field(
+        ...,
+        description=(
+            "The type of resource that would best help answer the user's question. "
+            "If none of the types are relevant return 'general'."
+        ),
+    )
+
+
+# Bind the Search function with function_call naming it, then parse the
+# function-call output into Search and keep only its question_resource value.
+classifier = llm.bind(
+    functions=[convert_pydantic_to_openai_function(Search)],
+    function_call={"name": "Search"},
+) | PydanticAttrOutputFunctionsParser(
+    pydantic_schema=Search, attr_name="question_resource"
+)
+
+# Resource-type label -> retriever; keys mirror Search.question_resource options.
+retriever_map = {
+    "medical paper": pubmed,
+    "scientific paper": arxiv,
+    "public company finances report": sec,
+    "general": wiki,
+}
+# Router over the retrievers above; the chain feeds it {"input": ..., "key": ...}.
+router_retriever = RouterRunnable(runnables=retriever_map)
+
+
+def format_docs(docs):
+    return "\n\n".join(f"Source {i}:\n{doc.page_content}" for i, doc in enumerate(docs))
+
+
+# System prompt; {sources} is filled with the formatted retrieved documents.
+# NOTE(review): {{topic}} is brace-escaped, so presumably the model receives a
+# literal "{topic}" placeholder rather than a substituted value — confirm.
+system = """Answer the user question. Use the following sources to help \
+answer the question. If you don't know the answer say "I'm not sure, I couldn't \
+find information on {{topic}}."
+
+Sources:
+
+{sources}"""
+prompt = ChatPromptTemplate.from_messages([("system", system), ("human", "{question}")])
+
+
+# Input type declared for the chain: the whole input is one string.
+class Question(BaseModel):
+    __root__: str
+
+
+# QA pipeline: classify the question, route it to the matching retriever,
+# format the retrieved documents into the prompt, and let the chat model answer.
+chain = (
+    (
+        # Keep the raw question as "input" and add the classifier's label as "key".
+        RunnableParallel(
+            {"input": RunnablePassthrough(), "key": classifier}
+        ).with_config(run_name="classify")
+        # Retrieve via the router and format the documents for the prompt.
+        | RunnableParallel(
+            {"question": itemgetter("input"), "sources": router_retriever | format_docs}
+        ).with_config(run_name="retrieve")
+        | prompt
+        | llm
+        | StrOutputParser()
+    )
+    .with_config(run_name="QA with router")
+    .with_types(input_type=Question)
+)
--- a/templates/rag-multi-index-router/tests/__init__.py
+++ b/templates/rag-multi-index-router/tests/__init__.py