2023-10-13 21:36:44 +00:00
|
|
|
"""Agent for working with pandas objects."""
|
|
|
|
from typing import Any, Dict, List, Optional, Sequence, Tuple
|
|
|
|
|
|
|
|
from langchain.agents.agent import AgentExecutor, BaseSingleActionAgent
|
|
|
|
from langchain.agents.mrkl.base import ZeroShotAgent
|
|
|
|
from langchain.agents.openai_functions_agent.base import OpenAIFunctionsAgent
|
|
|
|
from langchain.agents.types import AgentType
|
|
|
|
from langchain.callbacks.base import BaseCallbackManager
|
|
|
|
from langchain.chains.llm import LLMChain
|
|
|
|
from langchain.schema import BasePromptTemplate
|
|
|
|
from langchain.tools import BaseTool
|
2024-01-02 20:09:45 +00:00
|
|
|
from langchain_core.language_models import BaseLanguageModel
|
|
|
|
from langchain_core.messages import SystemMessage
|
2023-10-13 21:36:44 +00:00
|
|
|
|
|
|
|
from langchain_experimental.agents.agent_toolkits.pandas.prompt import (
|
|
|
|
FUNCTIONS_WITH_DF,
|
|
|
|
FUNCTIONS_WITH_MULTI_DF,
|
|
|
|
MULTI_DF_PREFIX,
|
|
|
|
MULTI_DF_PREFIX_FUNCTIONS,
|
|
|
|
PREFIX,
|
|
|
|
PREFIX_FUNCTIONS,
|
|
|
|
SUFFIX_NO_DF,
|
|
|
|
SUFFIX_WITH_DF,
|
|
|
|
SUFFIX_WITH_MULTI_DF,
|
|
|
|
)
|
|
|
|
from langchain_experimental.tools.python.tool import PythonAstREPLTool
|
|
|
|
|
|
|
|
|
|
|
|
def _get_multi_prompt(
    dfs: List[Any],
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    input_variables: Optional[List[str]] = None,
    include_df_in_prompt: Optional[bool] = True,
    number_of_head_rows: int = 5,
    extra_tools: Sequence[BaseTool] = (),
) -> Tuple[BasePromptTemplate, List[BaseTool]]:
    """Build the zero-shot prompt and tool list for several dataframes.

    Args:
        dfs: Dataframes to expose; they are bound in the Python tool's locals
            as ``df1``, ``df2``, ... in list order.
        prefix: Prompt prefix handed to ``ZeroShotAgent.create_prompt``;
            ``MULTI_DF_PREFIX`` is used when None.
        suffix: Prompt suffix. When given it is used as-is and ``dfs_head``
            is still added to the default input variables.
        input_variables: Explicit prompt input variables. When None, the
            defaults ``input``, ``agent_scratchpad`` and ``num_dfs`` are used
            (plus ``dfs_head`` when the heads are included).
        include_df_in_prompt: Only consulted when ``suffix`` is None; selects
            ``SUFFIX_WITH_MULTI_DF`` (truthy) or ``SUFFIX_NO_DF`` (falsy).
        number_of_head_rows: Rows taken from each dataframe via ``head``.
        extra_tools: Additional tools appended after the Python tool.

    Returns:
        A ``(prompt, tools)`` pair, where ``num_dfs`` / ``dfs_head`` have been
        pre-filled on the prompt whenever they appear in the input variables.
    """
    frame_count = len(dfs)

    # Pick the suffix text and decide whether the dataframe heads are rendered.
    if suffix is not None:
        chosen_suffix, with_heads = suffix, True
    elif include_df_in_prompt:
        chosen_suffix, with_heads = SUFFIX_WITH_MULTI_DF, True
    else:
        chosen_suffix, with_heads = SUFFIX_NO_DF, False

    # Defaults are only built when the caller did not supply their own list,
    # so a caller-provided list is never modified here.
    if input_variables is None:
        input_variables = ["input", "agent_scratchpad", "num_dfs"]
        if with_heads:
            input_variables.append("dfs_head")

    if prefix is None:
        prefix = MULTI_DF_PREFIX

    repl_locals = {f"df{pos}": frame for pos, frame in enumerate(dfs, start=1)}
    tools = [PythonAstREPLTool(locals=repl_locals), *extra_tools]

    prompt = ZeroShotAgent.create_prompt(
        tools,
        prefix=prefix,
        suffix=chosen_suffix,
        input_variables=input_variables,
    )

    filled_prompt = prompt.partial()
    if "dfs_head" in input_variables:
        heads_markdown = "\n\n".join(
            frame.head(number_of_head_rows).to_markdown() for frame in dfs
        )
        filled_prompt = filled_prompt.partial(
            num_dfs=str(frame_count), dfs_head=heads_markdown
        )
    if "num_dfs" in input_variables:
        filled_prompt = filled_prompt.partial(num_dfs=str(frame_count))
    return filled_prompt, tools
|
|
|
|
|
|
|
|
|
|
|
|
def _get_single_prompt(
    df: Any,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    input_variables: Optional[List[str]] = None,
    include_df_in_prompt: Optional[bool] = True,
    number_of_head_rows: int = 5,
    extra_tools: Sequence[BaseTool] = (),
) -> Tuple[BasePromptTemplate, List[BaseTool]]:
    """Build the zero-shot prompt and tool list for a single dataframe.

    Args:
        df: Dataframe to expose; bound as ``df`` in the Python tool's locals.
        prefix: Prompt prefix handed to ``ZeroShotAgent.create_prompt``;
            ``PREFIX`` is used when None.
        suffix: Prompt suffix. When given it is used as-is and ``df_head`` is
            still added to the default input variables.
        input_variables: Explicit prompt input variables. When None, the
            defaults ``input`` and ``agent_scratchpad`` are used (plus
            ``df_head`` when the head is included).
        include_df_in_prompt: Only consulted when ``suffix`` is None; selects
            ``SUFFIX_WITH_DF`` (truthy) or ``SUFFIX_NO_DF`` (falsy).
        number_of_head_rows: Rows taken from the dataframe via ``head``.
        extra_tools: Additional tools appended after the Python tool.

    Returns:
        A ``(prompt, tools)`` pair, where ``df_head`` has been pre-filled on
        the prompt whenever it appears in the input variables.
    """
    # Pick the suffix text and decide whether the dataframe head is rendered.
    if suffix is not None:
        chosen_suffix, with_head = suffix, True
    elif include_df_in_prompt:
        chosen_suffix, with_head = SUFFIX_WITH_DF, True
    else:
        chosen_suffix, with_head = SUFFIX_NO_DF, False

    # Defaults are only built when the caller did not supply their own list,
    # so a caller-provided list is never modified here.
    if input_variables is None:
        input_variables = ["input", "agent_scratchpad"]
        if with_head:
            input_variables.append("df_head")

    if prefix is None:
        prefix = PREFIX

    tools = [PythonAstREPLTool(locals={"df": df}), *extra_tools]

    prompt = ZeroShotAgent.create_prompt(
        tools,
        prefix=prefix,
        suffix=chosen_suffix,
        input_variables=input_variables,
    )

    filled_prompt = prompt.partial()
    if "df_head" in input_variables:
        head_markdown = str(df.head(number_of_head_rows).to_markdown())
        filled_prompt = filled_prompt.partial(df_head=head_markdown)
    return filled_prompt, tools
|
|
|
|
|
|
|
|
|
|
|
|
def _get_prompt_and_tools(
    df: Any,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    input_variables: Optional[List[str]] = None,
    include_df_in_prompt: Optional[bool] = True,
    number_of_head_rows: int = 5,
    extra_tools: Sequence[BaseTool] = (),
) -> Tuple[BasePromptTemplate, List[BaseTool]]:
    """Validate the input and build the zero-shot prompt and tools.

    Dispatches to ``_get_multi_prompt`` when ``df`` is a list and to
    ``_get_single_prompt`` otherwise.

    Args:
        df: A pandas ``DataFrame`` or a list of them.
        prefix: Optional prompt prefix, forwarded unchanged.
        suffix: Optional prompt suffix, forwarded unchanged.
        input_variables: Optional prompt input variables, forwarded unchanged.
        include_df_in_prompt: Whether to render dataframe heads in the prompt.
            Must be None when ``suffix`` is given.
        number_of_head_rows: Number of head rows to render, forwarded unchanged.
        extra_tools: Additional tools, forwarded unchanged.

    Returns:
        The ``(prompt, tools)`` pair produced by the selected builder.

    Raises:
        ImportError: If pandas is not installed.
        ValueError: If both ``suffix`` and ``include_df_in_prompt`` are set,
            or if ``df`` (or any element of a list ``df``) is not a
            ``DataFrame``.
    """
    # Keep the try body to the import alone so only a missing package is
    # translated into the installation hint.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas package not found, please install with `pip install pandas`"
        ) from exc

    # Sets the pandas "display.max_columns" option to None.
    # NOTE(review): presumably so wide frames are not truncated when rendered.
    pd.set_option("display.max_columns", None)

    if include_df_in_prompt is not None and suffix is not None:
        raise ValueError("If suffix is specified, include_df_in_prompt should not be.")

    if isinstance(df, list):
        for item in df:
            if not isinstance(item, pd.DataFrame):
                # Report the offending element's type; the container is
                # always a list at this point, which tells the caller nothing.
                raise ValueError(f"Expected pandas object, got {type(item)}")
        return _get_multi_prompt(
            df,
            prefix=prefix,
            suffix=suffix,
            input_variables=input_variables,
            include_df_in_prompt=include_df_in_prompt,
            number_of_head_rows=number_of_head_rows,
            extra_tools=extra_tools,
        )

    if not isinstance(df, pd.DataFrame):
        raise ValueError(f"Expected pandas object, got {type(df)}")
    return _get_single_prompt(
        df,
        prefix=prefix,
        suffix=suffix,
        input_variables=input_variables,
        include_df_in_prompt=include_df_in_prompt,
        number_of_head_rows=number_of_head_rows,
        extra_tools=extra_tools,
    )
|
|
|
|
|
|
|
|
|
|
|
|
def _get_functions_single_prompt(
    df: Any,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    include_df_in_prompt: Optional[bool] = True,
    number_of_head_rows: int = 5,
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]:
    """Build the OpenAI-functions prompt and tool list for one dataframe.

    Args:
        df: Dataframe to expose; bound as ``df`` in the Python tool's locals.
        prefix: Leading part of the system message; ``PREFIX_FUNCTIONS`` is
            used when None.
        suffix: Trailing part of the system message. When None,
            ``FUNCTIONS_WITH_DF`` is used if ``include_df_in_prompt`` is
            truthy, otherwise an empty string.
        include_df_in_prompt: When truthy, the chosen suffix is formatted
            with ``df_head`` set to the markdown of the dataframe head.
        number_of_head_rows: Rows taken from the dataframe via ``head``.

    Returns:
        A ``(prompt, tools)`` pair; ``tools`` holds a single Python tool.
    """
    # First choose the suffix template ...
    if suffix is not None:
        suffix_template = suffix
    elif include_df_in_prompt:
        suffix_template = FUNCTIONS_WITH_DF
    else:
        suffix_template = ""

    # ... then fill in the dataframe head only when it was requested.
    if include_df_in_prompt:
        head_markdown = str(df.head(number_of_head_rows).to_markdown())
        rendered_suffix = suffix_template.format(df_head=head_markdown)
    else:
        rendered_suffix = suffix_template

    if prefix is None:
        prefix = PREFIX_FUNCTIONS

    tools = [PythonAstREPLTool(locals={"df": df})]
    prompt = OpenAIFunctionsAgent.create_prompt(
        system_message=SystemMessage(content=prefix + rendered_suffix)
    )
    return prompt, tools
|
|
|
|
|
|
|
|
|
|
|
|
def _get_functions_multi_prompt(
    dfs: Any,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    include_df_in_prompt: Optional[bool] = True,
    number_of_head_rows: int = 5,
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]:
    """Build the OpenAI-functions prompt and tool list for several dataframes.

    Args:
        dfs: Dataframes to expose; they are bound in the Python tool's locals
            as ``df1``, ``df2``, ... in iteration order.
        prefix: Leading part of the system message;
            ``MULTI_DF_PREFIX_FUNCTIONS`` is used when None. It is formatted
            with ``num_dfs`` set to the number of dataframes.
        suffix: Trailing part of the system message. When None,
            ``FUNCTIONS_WITH_MULTI_DF`` is used if ``include_df_in_prompt``
            is truthy, otherwise an empty string.
        include_df_in_prompt: When truthy, the chosen suffix is formatted
            with ``dfs_head`` set to the joined markdown of each head.
        number_of_head_rows: Rows taken from each dataframe via ``head``.

    Returns:
        A ``(prompt, tools)`` pair; ``tools`` holds a single Python tool.
    """
    # First choose the suffix template ...
    if suffix is not None:
        suffix_template = suffix
    elif include_df_in_prompt:
        suffix_template = FUNCTIONS_WITH_MULTI_DF
    else:
        suffix_template = ""

    # ... then fill in the dataframe heads only when they were requested.
    if include_df_in_prompt:
        heads_markdown = "\n\n".join(
            frame.head(number_of_head_rows).to_markdown() for frame in dfs
        )
        rendered_suffix = suffix_template.format(dfs_head=heads_markdown)
    else:
        rendered_suffix = suffix_template

    if prefix is None:
        prefix = MULTI_DF_PREFIX_FUNCTIONS
    # NOTE(review): formatting is applied to caller-supplied prefixes as well
    # as the default one; confirm this matches the intended layout.
    prefix = prefix.format(num_dfs=str(len(dfs)))

    repl_locals = {f"df{pos}": frame for pos, frame in enumerate(dfs, start=1)}
    tools = [PythonAstREPLTool(locals=repl_locals)]
    prompt = OpenAIFunctionsAgent.create_prompt(
        system_message=SystemMessage(content=prefix + rendered_suffix)
    )
    return prompt, tools
|
|
|
|
|
|
|
|
|
|
|
|
def _get_functions_prompt_and_tools(
    df: Any,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    input_variables: Optional[List[str]] = None,
    include_df_in_prompt: Optional[bool] = True,
    number_of_head_rows: int = 5,
) -> Tuple[BasePromptTemplate, List[PythonAstREPLTool]]:
    """Validate the input and build the OpenAI-functions prompt and tools.

    Dispatches to ``_get_functions_multi_prompt`` when ``df`` is a list and
    to ``_get_functions_single_prompt`` otherwise.

    Args:
        df: A pandas ``DataFrame`` or a list of them.
        prefix: Optional system-message prefix, forwarded unchanged.
        suffix: Optional system-message suffix, forwarded unchanged.
        input_variables: Not supported by this agent type; must be None.
        include_df_in_prompt: Whether to render dataframe heads in the prompt.
            Must be None when ``suffix`` is given.
        number_of_head_rows: Number of head rows to render, forwarded unchanged.

    Returns:
        The ``(prompt, tools)`` pair produced by the selected builder.

    Raises:
        ImportError: If pandas is not installed.
        ValueError: If ``input_variables`` is given, if both ``suffix`` and
            ``include_df_in_prompt`` are set, or if ``df`` (or any element of
            a list ``df``) is not a ``DataFrame``.
    """
    # Keep the try body to the import alone so only a missing package is
    # translated into the installation hint.
    try:
        import pandas as pd
    except ImportError as exc:
        raise ImportError(
            "pandas package not found, please install with `pip install pandas`"
        ) from exc

    # Sets the pandas "display.max_columns" option to None.
    # NOTE(review): presumably so wide frames are not truncated when rendered.
    pd.set_option("display.max_columns", None)

    if input_variables is not None:
        raise ValueError("`input_variables` is not supported at the moment.")

    if include_df_in_prompt is not None and suffix is not None:
        raise ValueError("If suffix is specified, include_df_in_prompt should not be.")

    if isinstance(df, list):
        for item in df:
            if not isinstance(item, pd.DataFrame):
                # Report the offending element's type; the container is
                # always a list at this point, which tells the caller nothing.
                raise ValueError(f"Expected pandas object, got {type(item)}")
        return _get_functions_multi_prompt(
            df,
            prefix=prefix,
            suffix=suffix,
            include_df_in_prompt=include_df_in_prompt,
            number_of_head_rows=number_of_head_rows,
        )

    if not isinstance(df, pd.DataFrame):
        raise ValueError(f"Expected pandas object, got {type(df)}")
    return _get_functions_single_prompt(
        df,
        prefix=prefix,
        suffix=suffix,
        include_df_in_prompt=include_df_in_prompt,
        number_of_head_rows=number_of_head_rows,
    )
|
|
|
|
|
|
|
|
|
|
|
|
def create_pandas_dataframe_agent(
    llm: BaseLanguageModel,
    df: Any,
    agent_type: AgentType = AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    callback_manager: Optional[BaseCallbackManager] = None,
    prefix: Optional[str] = None,
    suffix: Optional[str] = None,
    input_variables: Optional[List[str]] = None,
    verbose: bool = False,
    return_intermediate_steps: bool = False,
    max_iterations: Optional[int] = 15,
    max_execution_time: Optional[float] = None,
    early_stopping_method: str = "force",
    agent_executor_kwargs: Optional[Dict[str, Any]] = None,
    include_df_in_prompt: Optional[bool] = True,
    number_of_head_rows: int = 5,
    extra_tools: Sequence[BaseTool] = (),
    **kwargs: Dict[str, Any],
) -> AgentExecutor:
    """Construct a pandas agent from an LLM and dataframe.

    Args:
        llm: Language model driving the agent.
        df: A pandas ``DataFrame`` or a list of them.
        agent_type: ``AgentType.ZERO_SHOT_REACT_DESCRIPTION`` or
            ``AgentType.OPENAI_FUNCTIONS``.
        callback_manager: Passed to the chain, the agent and the executor.
        prefix: Optional prompt prefix, forwarded to the prompt builder.
        suffix: Optional prompt suffix, forwarded to the prompt builder.
        input_variables: Optional prompt input variables, forwarded to the
            prompt builder.
        verbose: Forwarded to the executor.
        return_intermediate_steps: Forwarded to the executor.
        max_iterations: Forwarded to the executor.
        max_execution_time: Forwarded to the executor.
        early_stopping_method: Forwarded to the executor.
        agent_executor_kwargs: Extra keyword arguments for the executor.
        include_df_in_prompt: Forwarded to the prompt builder.
        number_of_head_rows: Forwarded to the prompt builder.
        extra_tools: Additional tools made available next to the Python tool.
        **kwargs: Extra keyword arguments for the agent constructor.

    Returns:
        An ``AgentExecutor`` wrapping the constructed agent and its tools.

    Raises:
        ValueError: If ``agent_type`` is not one of the two supported types.
    """
    agent: BaseSingleActionAgent
    tools: Sequence[BaseTool]

    if agent_type == AgentType.ZERO_SHOT_REACT_DESCRIPTION:
        # The zero-shot builder already appends ``extra_tools`` itself.
        react_prompt, tools = _get_prompt_and_tools(
            df,
            prefix=prefix,
            suffix=suffix,
            input_variables=input_variables,
            include_df_in_prompt=include_df_in_prompt,
            number_of_head_rows=number_of_head_rows,
            extra_tools=extra_tools,
        )
        agent = ZeroShotAgent(
            llm_chain=LLMChain(
                llm=llm,
                prompt=react_prompt,
                callback_manager=callback_manager,
            ),
            allowed_tools=[tool.name for tool in tools],
            callback_manager=callback_manager,
            **kwargs,
        )
    elif agent_type == AgentType.OPENAI_FUNCTIONS:
        functions_prompt, repl_tools = _get_functions_prompt_and_tools(
            df,
            prefix=prefix,
            suffix=suffix,
            input_variables=input_variables,
            include_df_in_prompt=include_df_in_prompt,
            number_of_head_rows=number_of_head_rows,
        )
        # The functions builder takes no ``extra_tools``; add them here.
        tools = [*repl_tools, *extra_tools]
        agent = OpenAIFunctionsAgent(
            llm=llm,
            prompt=functions_prompt,
            tools=tools,
            callback_manager=callback_manager,
            **kwargs,
        )
    else:
        raise ValueError(f"Agent type {agent_type} not supported at the moment.")

    executor_options = agent_executor_kwargs or {}
    return AgentExecutor.from_agent_and_tools(
        agent=agent,
        tools=tools,
        callback_manager=callback_manager,
        verbose=verbose,
        return_intermediate_steps=return_intermediate_steps,
        max_iterations=max_iterations,
        max_execution_time=max_execution_time,
        early_stopping_method=early_stopping_method,
        **executor_options,
    )
|