# langchain/templates/research-assistant/research_assistant/search/web.py

import json
from typing import Any

import requests
from bs4 import BeautifulSoup
from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever
from langchain_community.chat_models import ChatOpenAI
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain_core.messages import SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import (
    ConfigurableField,
    Runnable,
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)

RESULTS_PER_QUESTION = 3

ddg_search = DuckDuckGoSearchAPIWrapper()


def scrape_text(url: str, timeout: float = 10.0) -> str:
    """Download ``url`` and return the text content of the page.

    Errors are reported in-band rather than raised: any failure yields a
    string starting with ``"Failed to retrieve the webpage"`` so that callers
    always receive a ``str``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The page text with tags removed and fragments joined by single
        spaces, or a failure message describing the status code or exception.
    """
    try:
        # requests applies no timeout by default, so an unresponsive host
        # would otherwise block this call indefinitely.
        response = requests.get(url, timeout=timeout)

        # Anything other than 200 is reported as a failure message.
        if response.status_code != 200:
            return f"Failed to retrieve the webpage: Status code {response.status_code}"

        # Parse the HTML and flatten it to whitespace-separated text.
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text(separator=" ", strip=True)
    except Exception as e:
        # Best-effort scraping: report the problem and keep the pipeline going.
        print(e)
        return f"Failed to retrieve the webpage: {e}"


def web_search(query: str, num_results: int):
    """Return the ``"link"`` value of each DuckDuckGo result for ``query``.

    Args:
        query: Search string handed to the module-level ``ddg_search`` wrapper.
        num_results: Result count forwarded to ``ddg_search.results``.

    Returns:
        A list with one link per result, in the order the wrapper returned them.
    """
    links = []
    for hit in ddg_search.results(query, num_results):
        links.append(hit["link"])
    return links


# Runnable that turns an input mapping containing "question" into a list of
# {"url": ..., "question": ...} dicts, one per search hit.
# The default branch ("duckduckgo") goes through web_search(); the "tavily"
# alternative is selectable through the configurable field "search_engine".
get_links: Runnable[Any, Any] = (
    RunnablePassthrough()
    | RunnableLambda(
        # Pair every link returned by web_search() with the original question.
        lambda x: [
            {"url": url, "question": x["question"]}
            for url in web_search(query=x["question"], num_results=RESULTS_PER_QUESTION)
        ]
    )
).configurable_alternatives(
    ConfigurableField("search_engine"),
    default_key="duckduckgo",
    # Tavily branch: reduce the input to the bare question string first, then
    # fan out so the question is kept alongside the retriever results.
    tavily=RunnableLambda(lambda x: x["question"])
    | RunnableParallel(
        {
            "question": RunnablePassthrough(),
            "results": TavilySearchAPIRetriever(k=RESULTS_PER_QUESTION),
        }
    )
    | RunnableLambda(
        # The URL of each retrieved result is read from metadata["source"].
        lambda x: [
            {"url": result.metadata["source"], "question": x["question"]}
            for result in x["results"]
        ]
    ),
)


# Chat prompt used to generate search queries.
# Template variables: {agent_prompt} fills the system message and {question}
# is embedded in the user message, which asks for three queries formatted as
# a list of strings.
SEARCH_PROMPT = ChatPromptTemplate.from_messages(
    [
        ("system", "{agent_prompt}"),
        (
            "user",
            "Write 3 google search queries to search online that form an "
            "objective opinion from the following: {question}\n"
            "You must respond with a list of strings in the following format: "
            '["query 1", "query 2", "query 3"].',
        ),
    ]
)

# System instructions for the agent-selection step. The model is shown three
# task/response examples and is expected to answer with a JSON object holding
# the keys "agent" and "agent_role_prompt". Every example must itself be valid
# JSON, otherwise the model is taught to produce replies that cannot be parsed.
AUTO_AGENT_INSTRUCTIONS = """
This task involves researching a given topic, regardless of its complexity or the availability of a definitive answer. The research is conducted by a specific agent, defined by its type and role, with each agent requiring distinct instructions.
Agent
The agent is determined by the field of the topic and the specific name of the agent that could be utilized to research the topic provided. Agents are categorized by their area of expertise, and each agent type is associated with a corresponding emoji.

examples:
task: "should I invest in apple stocks?"
response: 
{
    "agent": "💰 Finance Agent",
    "agent_role_prompt": "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends."
}
task: "could reselling sneakers become profitable?"
response: 
{ 
    "agent":  "📈 Business Analyst Agent",
    "agent_role_prompt": "You are an experienced AI business analyst assistant. Your main objective is to produce comprehensive, insightful, impartial, and systematically structured business reports based on provided business data, market trends, and strategic analysis."
}
task: "what are the most interesting sites in Tel Aviv?"
response:
{
    "agent":  "🌍 Travel Agent",
    "agent_role_prompt": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights."
}
"""  # noqa: E501
# Prompt for choosing the research agent; the only template variable is {task}.
# The instructions are supplied as a ready-made SystemMessage instead of a
# ("system", ...) template tuple — presumably so the literal braces in the
# JSON examples are not interpreted as template variables (TODO confirm).
CHOOSE_AGENT_PROMPT = ChatPromptTemplate.from_messages(
    [SystemMessage(content=AUTO_AGENT_INSTRUCTIONS), ("user", "task: {task}")]
)

# Prompt text for summarizing one scraped page. Template variables: {text}
# (the page content) and {question} (what the summary should answer).
SUMMARY_TEMPLATE = """{text} 

-----------

Using the above text, answer in short the following question: 

> {question}
 
-----------
if the question cannot be answered using the text, simply summarize the text. Include all factual information, numbers, stats etc if available."""  # noqa: E501
# Chat prompt built from SUMMARY_TEMPLATE; expects "text" and "question".
SUMMARY_PROMPT = ChatPromptTemplate.from_template(SUMMARY_TEMPLATE)

# Runnable that takes one {"url", "question"} mapping and produces a single
# string of the form "Source Url: <url>\nSummary: <summary>".
scrape_and_summarize: Runnable[Any, Any] = (
    # Step 1: fetch the page text and carry the question and URL forward.
    RunnableParallel(
        {
            "question": lambda x: x["question"],
            # Only the first 10000 characters of the scraped text are kept.
            "text": lambda x: scrape_text(x["url"])[:10000],
            "url": lambda x: x["url"],
        }
    )
    # Step 2: summarize the text with the chat model while keeping the URL.
    | RunnableParallel(
        {
            "summary": SUMMARY_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),
            "url": lambda x: x["url"],
        }
    )
    # Step 3: render the URL and summary as one text block.
    | RunnableLambda(lambda x: f"Source Url: {x['url']}\nSummary: {x['summary']}")
)

# For one question: collect links, scrape and summarize each link, then join
# the per-link summaries with a single newline.
multi_search = get_links | scrape_and_summarize.map() | (lambda x: "\n".join(x))


def load_json(s):
    """Parse ``s`` as JSON, tolerating text around the JSON value.

    A strict ``json.loads`` is attempted first. If that fails and ``s`` is a
    string, the first JSON object or array found in it is decoded instead, so
    replies wrapped in a code fence, preceded by prose, or followed by a stray
    character such as a trailing period are still usable.

    Args:
        s: The text to parse (typically raw model output).

    Returns:
        The decoded value, or an empty dict when nothing could be parsed.
        This function never raises.
    """
    try:
        return json.loads(s)
    except Exception:
        # Fall through to the lenient path below.
        pass

    if not isinstance(s, str):
        return {}

    # Locate the first opening bracket; only that position is tried so a
    # malformed outer value never yields one of its nested fragments.
    starts = [i for i in (s.find("{"), s.find("[")) if i != -1]
    if not starts:
        return {}
    try:
        # raw_decode stops at the end of the first complete value and ignores
        # whatever follows it.
        value, _ = json.JSONDecoder().raw_decode(s, min(starts))
    except Exception:
        return {}
    return value


# Generates search queries: prompt -> chat model -> plain string -> load_json.
# load_json returns {} when the reply cannot be parsed.
search_query = SEARCH_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser() | load_json
# Picks the research agent using the same prompt -> model -> parse pipeline,
# driven by CHOOSE_AGENT_PROMPT.
choose_agent = (
    CHOOSE_AGENT_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser() | load_json
)

# Two-stage runnable: choose an agent persona, then generate search queries.
# assign() adds "agent_prompt" to the input mapping; its value is the
# "agent_role_prompt" entry of choose_agent's parsed reply (None when that key
# is absent, because .get() is used). The enriched mapping feeds search_query.
get_search_queries = (
    RunnablePassthrough().assign(
        # NOTE(review): `lambda x: x` forwards the entire input as "task", not
        # only the question text — confirm this is intended.
        agent_prompt=RunnableParallel({"task": lambda x: x})
        | choose_agent
        | (lambda x: x.get("agent_role_prompt"))
    )
    | search_query
)


# End-to-end search pipeline assembled from the runnables above:
#   1. get_search_queries produces the queries for the input,
#   2. each query is wrapped as {"question": query},
#   3. multi_search runs once per query,
#   4. the per-query results are joined with a blank line between them.
chain = (
    get_search_queries
    | (lambda x: [{"question": q} for q in x])
    | multi_search.map()
    | (lambda x: "\n\n".join(x))
)
FEATURE gpt researcher template (#13062) Co-authored-by: Erick Friis <erick@langchain.dev> 2023-11-13 23:52:25 +00:00			`import json`
			`from typing import Any`

			`import requests`
			`from bs4 import BeautifulSoup`
			`from langchain.retrievers.tavily_search_api import TavilySearchAPIRetriever`
docs, community[patch], experimental[patch], langchain[patch], cli[pa… (#15412) …tch]: import models from community ran ```bash git grep -l 'from langchain\.chat_models' \| xargs -L 1 sed -i '' "s/from\ langchain\.chat_models/from\ langchain_community.chat_models/g" git grep -l 'from langchain\.llms' \| xargs -L 1 sed -i '' "s/from\ langchain\.llms/from\ langchain_community.llms/g" git grep -l 'from langchain\.embeddings' \| xargs -L 1 sed -i '' "s/from\ langchain\.embeddings/from\ langchain_community.embeddings/g" git checkout master libs/langchain/tests/unit_tests/llms git checkout master libs/langchain/tests/unit_tests/chat_models git checkout master libs/langchain/tests/unit_tests/embeddings/test_imports.py make format cd libs/langchain; make format cd ../experimental; make format cd ../core; make format ``` 2024-01-02 20:32:16 +00:00			`from langchain_community.chat_models import ChatOpenAI`
langchain[patch], experimental[patch]: update utilities imports (#15438) 2024-01-03 07:18:15 +00:00			`from langchain_community.utilities import DuckDuckGoSearchAPIWrapper`
docs[patch], templates[patch]: Import from core (#14575) Update imports to use core for the low-hanging fruit changes. Ran following ```bash git grep -l 'langchain.schema.runnable' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.runnable/langchain_core.runnables/g' git grep -l 'langchain.schema.output_parser' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.output_parser/langchain_core.output_parsers/g' git grep -l 'langchain.schema.messages' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.messages/langchain_core.messages/g' git grep -l 'langchain.schema.chat_histry' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.chat_history/langchain_core.chat_history/g' git grep -l 'langchain.schema.prompt_template' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.prompt_template/langchain_core.prompts/g' git grep -l 'from langchain.pydantic_v1' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.pydantic_v1/from langchain_core.pydantic_v1/g' git grep -l 'from langchain.tools.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.tools\.base/from langchain_core.tools/g' git grep -l 'from langchain.chat_models.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.chat_models.base/from langchain_core.language_models.chat_models/g' git grep -l 'from langchain.llms.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.llms\.base\ /from langchain_core.language_models.llms\ /g' git grep -l 'from langchain.embeddings.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.embeddings\.base/from langchain_core.embeddings/g' git grep -l 'from langchain.vectorstores.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.vectorstores\.base/from langchain_core.vectorstores/g' git grep -l 'from langchain.agents.tools' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.agents\.tools/from 
langchain_core.tools/g' git grep -l 'from langchain.schema.output' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.output\ /from langchain_core.outputs\ /g' git grep -l 'from langchain.schema.embeddings' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.embeddings/from langchain_core.embeddings/g' git grep -l 'from langchain.schema.document' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.document/from langchain_core.documents/g' git grep -l 'from langchain.schema.agent' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.agent/from langchain_core.agents/g' git grep -l 'from langchain.schema.prompt ' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.prompt\ /from langchain_core.prompt_values /g' git grep -l 'from langchain.schema.language_model' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.language_model/from langchain_core.language_models/g' ``` 2023-12-12 00:49:10 +00:00			`from langchain_core.messages import SystemMessage`
			`from langchain_core.output_parsers import StrOutputParser`
templates: fix deps (#15439) 2024-01-03 21:28:05 +00:00			`from langchain_core.prompts import ChatPromptTemplate`
docs[patch], templates[patch]: Import from core (#14575) Update imports to use core for the low-hanging fruit changes. Ran following ```bash git grep -l 'langchain.schema.runnable' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.runnable/langchain_core.runnables/g' git grep -l 'langchain.schema.output_parser' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.output_parser/langchain_core.output_parsers/g' git grep -l 'langchain.schema.messages' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.messages/langchain_core.messages/g' git grep -l 'langchain.schema.chat_histry' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.chat_history/langchain_core.chat_history/g' git grep -l 'langchain.schema.prompt_template' {docs,templates,cookbook} \| xargs sed -i '' 's/langchain\.schema\.prompt_template/langchain_core.prompts/g' git grep -l 'from langchain.pydantic_v1' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.pydantic_v1/from langchain_core.pydantic_v1/g' git grep -l 'from langchain.tools.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.tools\.base/from langchain_core.tools/g' git grep -l 'from langchain.chat_models.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.chat_models.base/from langchain_core.language_models.chat_models/g' git grep -l 'from langchain.llms.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.llms\.base\ /from langchain_core.language_models.llms\ /g' git grep -l 'from langchain.embeddings.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.embeddings\.base/from langchain_core.embeddings/g' git grep -l 'from langchain.vectorstores.base' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.vectorstores\.base/from langchain_core.vectorstores/g' git grep -l 'from langchain.agents.tools' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.agents\.tools/from 
langchain_core.tools/g' git grep -l 'from langchain.schema.output' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.output\ /from langchain_core.outputs\ /g' git grep -l 'from langchain.schema.embeddings' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.embeddings/from langchain_core.embeddings/g' git grep -l 'from langchain.schema.document' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.document/from langchain_core.documents/g' git grep -l 'from langchain.schema.agent' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.agent/from langchain_core.agents/g' git grep -l 'from langchain.schema.prompt ' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.prompt\ /from langchain_core.prompt_values /g' git grep -l 'from langchain.schema.language_model' {docs,templates,cookbook} \| xargs sed -i '' 's/from langchain\.schema\.language_model/from langchain_core.language_models/g' ``` 2023-12-12 00:49:10 +00:00			`from langchain_core.runnables import (`
FEATURE gpt researcher template (#13062) Co-authored-by: Erick Friis <erick@langchain.dev> 2023-11-13 23:52:25 +00:00			`ConfigurableField,`
			`Runnable,`
			`RunnableLambda,`
			`RunnableParallel,`
			`RunnablePassthrough,`
			`)`

			`RESULTS_PER_QUESTION = 3`

			`ddg_search = DuckDuckGoSearchAPIWrapper()`


			`def scrape_text(url: str):`
			`# Send a GET request to the webpage`
			`try:`
			`response = requests.get(url)`

			`# Check if the request was successful`
			`if response.status_code == 200:`
			`# Parse the content of the request with BeautifulSoup`
			`soup = BeautifulSoup(response.text, "html.parser")`

			`# Extract all text from the webpage`
			`page_text = soup.get_text(separator=" ", strip=True)`

			`# Print the extracted text`
			`return page_text`
			`else:`
			`return f"Failed to retrieve the webpage: Status code {response.status_code}"`
			`except Exception as e:`
			`print(e)`
			`return f"Failed to retrieve the webpage: {e}"`


			`def web_search(query: str, num_results: int):`
			`results = ddg_search.results(query, num_results)`
			`return [r["link"] for r in results]`


			`get_links: Runnable[Any, Any] = (`
			`RunnablePassthrough()`
			`\| RunnableLambda(`
			`lambda x: [`
			`{"url": url, "question": x["question"]}`
			`for url in web_search(query=x["question"], num_results=RESULTS_PER_QUESTION)`
			`]`
			`)`
			`).configurable_alternatives(`
			`ConfigurableField("search_engine"),`
			`default_key="duckduckgo",`
			`tavily=RunnableLambda(lambda x: x["question"])`
			`\| RunnableParallel(`
			`{`
			`"question": RunnablePassthrough(),`
			`"results": TavilySearchAPIRetriever(k=RESULTS_PER_QUESTION),`
			`}`
			`)`
			`\| RunnableLambda(`
			`lambda x: [`
			`{"url": result.metadata["source"], "question": x["question"]}`
			`for result in x["results"]`
			`]`
			`),`
			`)`


			`SEARCH_PROMPT = ChatPromptTemplate.from_messages(`
			`[`
			`("system", "{agent_prompt}"),`
			`(`
			`"user",`
			`"Write 3 google search queries to search online that form an "`
			`"objective opinion from the following: {question}\n"`
			`"You must respond with a list of strings in the following format: "`
			`'["query 1", "query 2", "query 3"].',`
			`),`
			`]`
			`)`

			`AUTO_AGENT_INSTRUCTIONS = """`
			`This task involves researching a given topic, regardless of its complexity or the availability of a definitive answer. The research is conducted by a specific agent, defined by its type and role, with each agent requiring distinct instructions.`
			`Agent`
			`The agent is determined by the field of the topic and the specific name of the agent that could be utilized to research the topic provided. Agents are categorized by their area of expertise, and each agent type is associated with a corresponding emoji.`

			`examples:`
			`task: "should I invest in apple stocks?"`
			`response:`
			`{`
			`"agent": "💰 Finance Agent",`
			`"agent_role_prompt: "You are a seasoned finance analyst AI assistant. Your primary goal is to compose comprehensive, astute, impartial, and methodically arranged financial reports based on provided data and trends."`
			`}`
			`task: "could reselling sneakers become profitable?"`
			`response:`
			`{`
			`"agent": "📈 Business Analyst Agent",`
			`"agent_role_prompt": "You are an experienced AI business analyst assistant. Your main objective is to produce comprehensive, insightful, impartial, and systematically structured business reports based on provided business data, market trends, and strategic analysis."`
			`}`
			`task: "what are the most interesting sites in Tel Aviv?"`
			`response:`
			`{`
			`"agent: "🌍 Travel Agent",`
			`"agent_role_prompt": "You are a world-travelled AI tour guide assistant. Your main purpose is to draft engaging, insightful, unbiased, and well-structured travel reports on given locations, including history, attractions, and cultural insights."`
			`}`
			`""" # noqa: E501`
			`CHOOSE_AGENT_PROMPT = ChatPromptTemplate.from_messages(`
			`[SystemMessage(content=AUTO_AGENT_INSTRUCTIONS), ("user", "task: {task}")]`
			`)`

			`SUMMARY_TEMPLATE = """{text}`

			`-----------`

			`Using the above text, answer in short the following question:`

			`> {question}`

			`-----------`
			`if the question cannot be answered using the text, imply summarize the text. Include all factual information, numbers, stats etc if available.""" # noqa: E501`
			`SUMMARY_PROMPT = ChatPromptTemplate.from_template(SUMMARY_TEMPLATE)`

			`scrape_and_summarize: Runnable[Any, Any] = (`
			`RunnableParallel(`
			`{`
			`"question": lambda x: x["question"],`
			`"text": lambda x: scrape_text(x["url"])[:10000],`
			`"url": lambda x: x["url"],`
			`}`
			`)`
			`\| RunnableParallel(`
			`{`
			`"summary": SUMMARY_PROMPT \| ChatOpenAI(temperature=0) \| StrOutputParser(),`
			`"url": lambda x: x["url"],`
			`}`
			`)`
			`\| RunnableLambda(lambda x: f"Source Url: {x['url']}\nSummary: {x['summary']}")`
			`)`

			`multi_search = get_links \| scrape_and_summarize.map() \| (lambda x: "\n".join(x))`


			`def load_json(s):`
			`try:`
			`return json.loads(s)`
			`except Exception:`
			`return {}`


			`search_query = SEARCH_PROMPT \| ChatOpenAI(temperature=0) \| StrOutputParser() \| load_json`
			`choose_agent = (`
			`CHOOSE_AGENT_PROMPT \| ChatOpenAI(temperature=0) \| StrOutputParser() \| load_json`
			`)`

			`get_search_queries = (`
			`RunnablePassthrough().assign(`
			`agent_prompt=RunnableParallel({"task": lambda x: x})`
			`\| choose_agent`
			`\| (lambda x: x.get("agent_role_prompt"))`
			`)`
			`\| search_query`
			`)`


			`chain = (`
			`get_search_queries`
			`\| (lambda x: [{"question": q} for q in x])`
			`\| multi_search.map()`
			`\| (lambda x: "\n\n".join(x))`
			`)`