FEATURE gpt researcher template (#13062)

Co-authored-by: Erick Friis <erick@langchain.dev>
7 months ago · 2ff30b50f2
parent 280ecfd8eb
commit 2ff30b50f2
10 changed files with 2567 additions and 0 deletions
--- a/templates/research-assistant/LICENSE
+++ b/templates/research-assistant/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 LangChain, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/templates/research-assistant/README.md
+++ b/templates/research-assistant/README.md
@ -0,0 +1,75 @@
+# research-assistant
+
+This template implements a version of  
+[GPT Researcher](https://github.com/assafelovic/gpt-researcher) that you can use
+as a starting point for a research agent.
+
+## Environment Setup
+
+The default template relies on ChatOpenAI and DuckDuckGo, so you will need the 
+following environment variable:
+
+- `OPENAI_API_KEY`
+
+And to use the Tavily LLM-optimized search engine, you will need:
+
+- `TAVILY_API_KEY`
+
+## Usage
+
+To use this package, you should first have the LangChain CLI installed:
+
+```shell
+pip install -U langchain-cli
+```
+
+To create a new LangChain project and install this as the only package, you can do:
+
+```shell
+langchain app new my-app --package research-assistant
+```
+
+If you want to add this to an existing project, you can just run:
+
+```shell
+langchain app add research-assistant
+```
+
+And add the following code to your `server.py` file:
+```python
+from research_assistant import chain as research_assistant_chain
+
+add_routes(app, research_assistant_chain, path="/research-assistant")
+```
+
+(Optional) Let's now configure LangSmith. 
+LangSmith will help us trace, monitor, and debug LangChain applications. 
+LangSmith is currently in private beta; you can sign up [here](https://smith.langchain.com/). 
+If you don't have access, you can skip this section.
+
+
+```shell
+export LANGCHAIN_TRACING_V2=true
+export LANGCHAIN_API_KEY=<your-api-key>
+export LANGCHAIN_PROJECT=<your-project>  # if not specified, defaults to "default"
+```
+
+If you are inside this directory, then you can spin up a LangServe instance directly by:
+
+```shell
+langchain serve
+```
+
+This will start the FastAPI app with a server running locally at 
+[http://localhost:8000](http://localhost:8000)
+
+We can see all templates at [http://127.0.0.1:8000/docs](http://127.0.0.1:8000/docs)
+We can access the playground at [http://127.0.0.1:8000/research-assistant/playground](http://127.0.0.1:8000/research-assistant/playground)  
+
+We can access the template from code with:
+
+```python
+from langserve.client import RemoteRunnable
+
+runnable = RemoteRunnable("http://localhost:8000/research-assistant")
+```
--- a/templates/research-assistant/poetry.lock
+++ b/templates/research-assistant/poetry.lock
--- a/templates/research-assistant/pyproject.toml
+++ b/templates/research-assistant/pyproject.toml
@ -0,0 +1,27 @@
+[tool.poetry]
+name = "research-assistant"
+version = "0.0.1"
+description = ""
+authors = []
+readme = "README.md"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+langchain = ">=0.0.313, <0.1"
+openai = "^0.28.1"
+beautifulsoup4 = "^4.12.2"
+duckduckgo-search = "^3.9.5"
+tavily-python = "^0.2.6"
+
+[tool.poetry.group.dev.dependencies]
+langchain-cli = ">=0.0.15"
+fastapi = "^0.104.0"
+sse-starlette = "^1.6.5"
+
+[tool.langserve]
+export_module = "research_assistant"
+export_attr = "chain"
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
--- a/templates/research-assistant/research_assistant/init.py
+++ b/templates/research-assistant/research_assistant/init.py
@ -0,0 +1,3 @@
+from research_assistant.chain import chain
+
+__all__ = ["chain"]
--- a/templates/research-assistant/research_assistant/chain.py
+++ b/templates/research-assistant/research_assistant/chain.py
@ -0,0 +1,16 @@
+from langchain.pydantic_v1 import BaseModel
+from langchain.schema.runnable import RunnablePassthrough
+
+from research_assistant.search.web import chain as search_chain
+from research_assistant.writer import chain as writer_chain
+
+# Untyped pipeline: add a `research_summary` key (produced by the web-search
+# chain) alongside the incoming fields, then hand the result to the writer.
+chain_notypes = (
+    RunnablePassthrough().assign(research_summary=search_chain) | writer_chain
+)
+
+
+# Input schema declared for the exported chain (see `with_types` below).
+class InputType(BaseModel):
+    # The research question or topic to investigate.
+    question: str
+
+
+# Same pipeline as `chain_notypes`, with `InputType` declared as its input type.
+chain = chain_notypes.with_types(input_type=InputType)
--- a/templates/research-assistant/research_assistant/search/init.py
+++ b/templates/research-assistant/research_assistant/search/init.py
--- a/templates/research-assistant/research_assistant/search/web.py
+++ b/templates/research-assistant/research_assistant/search/web.py
@ -0,0 +1,180 @@
+import json
+from typing import Any
+
+import requests
+from bs4 import BeautifulSoup
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import ChatPromptTemplate
+from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
+from langchain.schema.messages import SystemMessage
+from langchain.schema.output_parser import StrOutputParser
+from langchain.schema.runnable import (
+    ConfigurableField,
+    Runnable,
+    RunnableLambda,
+    RunnableParallel,
+    RunnablePassthrough,
+)
+from langchain.utilities import DuckDuckGoSearchAPIWrapper
+
+# Number of search hits requested per query, for both search backends below.
+RESULTS_PER_QUESTION = 3
+
+# Shared DuckDuckGo client used by `web_search`.
+ddg_search = DuckDuckGoSearchAPIWrapper()
+
+
+def scrape_text(url: str):
+    # Send a GET request to the webpage
+    try:
+        response = requests.get(url)
+
+        # Check if the request was successful
+        if response.status_code == 200:
+            # Parse the content of the request with BeautifulSoup
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Extract all text from the webpage
+            page_text = soup.get_text(separator=" ", strip=True)
+
+            # Print the extracted text
+            return page_text
+        else:
+            return f"Failed to retrieve the webpage: Status code {response.status_code}"
+    except Exception as e:
+        print(e)
+        return f"Failed to retrieve the webpage: {e}"
+
+
+def web_search(query: str, num_results: int):
+    results = ddg_search.results(query, num_results)
+    return [r["link"] for r in results]
+
+
+# Turns {"question": ...} into a list of {"url", "question"} dicts, one per
+# search hit. DuckDuckGo is the default; the "tavily" alternative can be
+# selected through the `search_engine` configurable field.
+get_links: Runnable[Any, Any] = (
+    RunnablePassthrough()
+    | RunnableLambda(
+        lambda x: [
+            {"url": url, "question": x["question"]}
+            for url in web_search(query=x["question"], num_results=RESULTS_PER_QUESTION)
+        ]
+    )
+).configurable_alternatives(
+    ConfigurableField("search_engine"),
+    default_key="duckduckgo",
+    # Tavily branch: pull out the question string, run the retriever on it
+    # while keeping the question, then pair each result's source with it.
+    tavily=RunnableLambda(lambda x: x["question"])
+    | RunnableParallel(
+        {
+            "question": RunnablePassthrough(),
+            "results": TavilySearchAPIRetriever(k=RESULTS_PER_QUESTION),
+        }
+    )
+    | RunnableLambda(
+        lambda x: [
+            {"url": result.metadata["source"], "question": x["question"]}
+            for result in x["results"]
+        ]
+    ),
+)
+
+
+# Prompt asking the model for three search queries about {question}; the
+# system message is filled from {agent_prompt}. The requested reply format
+# is a JSON-style list of strings.
+SEARCH_PROMPT = ChatPromptTemplate.from_messages(
+    [
+        ("system", "{agent_prompt}"),
+        (
+            "user",
+            "Write 3 google search queries to search online that form an "
+            "objective opinion from the following: {question}\n"
+            "You must respond with a list of strings in the following format: "
+            '["query 1", "query 2", "query 3"].',
+        ),
+    ]
+)
+
+AUTO_AGENT_INSTRUCTIONS = """
+This task involves researching a given topic, regardless of its complexity or the availability of a definitive answer. The research is conducted by a specific agent, defined by its type and role, with each agent requiring distinct instructions.
+Agent
+The agent is determined by the field of the topic and the specific name of the agent that could be utilized to research the topic provided. Agents are categorized by their area of expertise, and each agent type is associated with a corresponding emoji.
+
+examples:
+task: "should I invest in apple stocks?"
+response: 
+{
+    "agent": "💰 Finance Agent",
+    "agent_role_prompt: "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends."
+}
+task: "could reselling sneakers become profitable?"
+response: 
+{ 
+    "agent":  "📈 Business Analyst Agent",
+    "agent_role_prompt": "You are an experienced AI business analyst assistant. Your main objective is to produce comprehensive, insightful, impartial, and systematically structured business reports based on provided business data, market trends, and strategic analysis."
+}
+task: "what are the most interesting sites in Tel Aviv?"
+response:
+{
+    "agent:  "🌍 Travel Agent",
+    "agent_role_prompt": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights."
+}
+"""  # noqa: E501
+CHOOSE_AGENT_PROMPT = ChatPromptTemplate.from_messages(
+    [SystemMessage(content=AUTO_AGENT_INSTRUCTIONS), ("user", "task: {task}")]
+)
+
+SUMMARY_TEMPLATE = """{text} 
+
+-----------
+
+Using the above text, answer in short the following question: 
+
+> {question}
+ 
+-----------
+if the question cannot be answered using the text, imply summarize the text. Include all factual information, numbers, stats etc if available."""  # noqa: E501
+SUMMARY_PROMPT = ChatPromptTemplate.from_template(SUMMARY_TEMPLATE)
+
+# For one {"url", "question"} item: scrape the page, summarize it with
+# respect to the question, and format the result as a source/summary string.
+scrape_and_summarize: Runnable[Any, Any] = (
+    RunnableParallel(
+        {
+            "question": lambda x: x["question"],
+            # Only the first 10,000 characters of the scraped text are kept.
+            "text": lambda x: scrape_text(x["url"])[:10000],
+            "url": lambda x: x["url"],
+        }
+    )
+    | RunnableParallel(
+        {
+            "summary": SUMMARY_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),
+            # Carry the URL forward so it can be cited next to the summary.
+            "url": lambda x: x["url"],
+        }
+    )
+    | RunnableLambda(lambda x: f"Source Url: {x['url']}\nSummary: {x['summary']}")
+)
+
+# For one question: find links, summarize each page, join with newlines.
+multi_search = get_links | scrape_and_summarize.map() | (lambda x: "\n".join(x))
+
+
+def load_json(s):
+    try:
+        return json.loads(s)
+    except Exception:
+        return {}
+
+
+# Both chains run a prompt through the chat model at temperature 0 and parse
+# the reply as JSON; `load_json` yields {} when the reply is not valid JSON.
+search_query = SEARCH_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser() | load_json
+choose_agent = (
+    CHOOSE_AGENT_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser() | load_json
+)
+
+get_search_queries = (
+    RunnablePassthrough().assign(
+        agent_prompt=RunnableParallel({"task": lambda x: x})
+        | choose_agent
+        | (lambda x: x.get("agent_role_prompt"))
+    )
+    | search_query
+)
+
+
+# Full search pipeline: generate search queries, run `multi_search` on each
+# one, and join the per-query results with blank lines.
+chain = (
+    get_search_queries
+    # Wrap each generated query as the {"question": ...} input `multi_search` expects.
+    | (lambda x: [{"question": q} for q in x])
+    | multi_search.map()
+    | (lambda x: "\n\n".join(x))
+)
--- a/templates/research-assistant/research_assistant/writer.py
+++ b/templates/research-assistant/research_assistant/writer.py
@ -0,0 +1,31 @@
+from langchain.chat_models import ChatOpenAI
+from langchain.prompts import ChatPromptTemplate
+from langchain.schema.output_parser import StrOutputParser
+
+REPORT_TEMPLATE = """Information: 
+--------
+{research_summary}
+--------
+
+Using the above information, answer the following question or topic: "{question}" in a detailed report -- \
+The report should focus on the answer to the question, should be well structured, informative, \
+in depth, with facts and numbers if available and a minimum of 1,200 words.
+
+You should strive to write the report as long as you can using all relevant and necessary information provided.
+You must write the report with markdown syntax.
+You MUST determine your own concrete and valid opinion based on the given information. Do NOT deter to general and meaningless conclusions.
+Write all used source urls at the end of the report, and make sure to not add duplicated sources, but only one reference for each.
+You must write the report in apa format.
+Please do your best, this is very important to my career."""  # noqa: E501
+
+# Chat model used to write the report (temperature 0).
+model = ChatOpenAI(temperature=0)
+# System persona plus the report instructions as the user message.
+prompt = ChatPromptTemplate.from_messages(
+    [
+        (
+            "system",
+            "You are an AI critical thinker research assistant. Your sole purpose is to write well written, critically acclaimed, objective and structured reports on given text.",  # noqa: E501
+        ),
+        ("user", REPORT_TEMPLATE),
+    ]
+)
+# Writer chain: fill the prompt, call the model, return the reply as a string.
+chain = prompt | model | StrOutputParser()
--- a/templates/research-assistant/tests/init.py
+++ b/templates/research-assistant/tests/init.py