mirror of https://github.com/hwchase17/langchain
merge from upstream/master
commit
b78d672a43
@ -0,0 +1,2 @@
|
|||||||
|
position: 0
|
||||||
|
collapsed: false
|
@ -1,9 +0,0 @@
|
|||||||
---
|
|
||||||
sidebar_position: 3
|
|
||||||
---
|
|
||||||
|
|
||||||
# Web Scraping
|
|
||||||
|
|
||||||
Web scraping has historically been a challenging endeavor due to the ever-changing nature of website structures, making it tedious for developers to maintain their scraping scripts. Traditional methods often rely on specific HTML tags and patterns which, when altered, can disrupt data extraction processes.
|
|
||||||
|
|
||||||
Enter the LLM-based method for parsing HTML: By leveraging the capabilities of LLMs, and especially OpenAI Functions in LangChain's extraction chain, developers can instruct the model to extract only the desired data in a specified format. This method not only streamlines the extraction process but also significantly reduces the time spent on manual debugging and script modifications. Its adaptability means that even if websites undergo significant design changes, the extraction remains consistent and robust. This level of resilience translates to reduced maintenance efforts, cost savings, and ensures a higher quality of extracted data. Compared to its predecessors, the LLM-based approach wins out in the web scraping domain by transforming a historically cumbersome task into a more automated and efficient process.
|
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,203 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e89f490d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Agents\n",
|
||||||
|
"\n",
|
||||||
|
"You can pass a Runnable into an agent."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "af4381de",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.agents import XMLAgent, tool, AgentExecutor\n",
|
||||||
|
"from langchain.chat_models import ChatAnthropic"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "24cc8134",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"model = ChatAnthropic(model=\"claude-2\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "67c0b0e4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"@tool\n",
|
||||||
|
"def search(query: str) -> str:\n",
|
||||||
|
" \"\"\"Search things about current events.\"\"\"\n",
|
||||||
|
" return \"32 degrees\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "7203b101",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"tool_list = [search]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "b68e756d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Get prompt to use\n",
|
||||||
|
"prompt = XMLAgent.get_default_prompt()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "61ab3e9a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Logic for going from intermediate steps to a string to pass into model\n",
|
||||||
|
"# This is pretty tied to the prompt\n",
|
||||||
|
"def convert_intermediate_steps(intermediate_steps):\n",
|
||||||
|
" log = \"\"\n",
|
||||||
|
" for action, observation in intermediate_steps:\n",
|
||||||
|
" log += (\n",
|
||||||
|
" f\"<tool>{action.tool}</tool><tool_input>{action.tool_input}\"\n",
|
||||||
|
" f\"</tool_input><observation>{observation}</observation>\"\n",
|
||||||
|
" )\n",
|
||||||
|
" return log\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"# Logic for converting tools to string to go in prompt\n",
|
||||||
|
"def convert_tools(tools):\n",
|
||||||
|
" return \"\\n\".join([f\"{tool.name}: {tool.description}\" for tool in tools])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "260f5988",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Building an agent from a runnable usually involves a few things:\n",
|
||||||
|
"\n",
|
||||||
|
"1. Data processing for the intermediate steps. These need to represented in a way that the language model can recognize them. This should be pretty tightly coupled to the instructions in the prompt\n",
|
||||||
|
"\n",
|
||||||
|
"2. The prompt itself\n",
|
||||||
|
"\n",
|
||||||
|
"3. The model, complete with stop tokens if needed\n",
|
||||||
|
"\n",
|
||||||
|
"4. The output parser - should be in sync with how the prompt specifies things to be formatted."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "e92f1d6f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"agent = (\n",
|
||||||
|
" {\n",
|
||||||
|
" \"question\": lambda x: x[\"question\"],\n",
|
||||||
|
" \"intermediate_steps\": lambda x: convert_intermediate_steps(x[\"intermediate_steps\"])\n",
|
||||||
|
" }\n",
|
||||||
|
" | prompt.partial(tools=convert_tools(tool_list))\n",
|
||||||
|
" | model.bind(stop=[\"</tool_input>\", \"</final_answer>\"])\n",
|
||||||
|
" | XMLAgent.get_default_output_parser()\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "6ce6ec7a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"agent_executor = AgentExecutor(agent=agent, tools=tool_list, verbose=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "fb5cb2e3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
|
||||||
|
"\u001b[32;1m\u001b[1;3m <tool>search</tool>\n",
|
||||||
|
"<tool_input>weather in new york\u001b[0m\u001b[36;1m\u001b[1;3m32 degrees\u001b[0m\u001b[32;1m\u001b[1;3m\n",
|
||||||
|
"\n",
|
||||||
|
"<final_answer>The weather in New York is 32 degrees\u001b[0m\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'question': 'whats the weather in New york?',\n",
|
||||||
|
" 'output': 'The weather in New York is 32 degrees'}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"agent_executor.invoke({\"question\": \"whats the weather in New york?\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "bce86dd8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,119 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f09fd305",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Code writing\n",
|
||||||
|
"\n",
|
||||||
|
"Example of how to use LCEL to write Python code."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "bd7c259a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate\n",
|
||||||
|
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||||
|
"from langchain.utilities import PythonREPL"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "73795d2d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"template = \"\"\"Write some python code to solve the user's problem. \n",
|
||||||
|
"\n",
|
||||||
|
"Return only python code in Markdown format, e.g.:\n",
|
||||||
|
"\n",
|
||||||
|
"```python\n",
|
||||||
|
"....\n",
|
||||||
|
"```\"\"\"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_messages(\n",
|
||||||
|
" [(\"system\", template), (\"human\", \"{input}\")]\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"model = ChatOpenAI()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "42859e8a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def _sanitize_output(text: str):\n",
|
||||||
|
" _, after = text.split(\"```python\")\n",
|
||||||
|
" return after.split(\"```\")[0]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "5ded1a86",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"id": "208c2b75",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Python REPL can execute arbitrary code. Use with caution.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'4\\n'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"input\": \"whats 2 plus 2\"})"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,11 @@
|
|||||||
|
---
|
||||||
|
sidebar_position: 2
|
||||||
|
---
|
||||||
|
|
||||||
|
# Cookbook
|
||||||
|
|
||||||
|
import DocCardList from "@theme/DocCardList";
|
||||||
|
|
||||||
|
Example code for accomplishing common tasks with the LangChain Expression Language (LCEL). These examples show how to compose different Runnable (the core LCEL interface) components to achieve various tasks. If you're just getting acquainted with LCEL, the [Prompt + LLM](/docs/expression_language/cookbook/prompt_llm_parser) page is a good place to start.
|
||||||
|
|
||||||
|
<DocCardList />
|
@ -0,0 +1,180 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "5062941a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Adding memory\n",
|
||||||
|
"\n",
|
||||||
|
"This shows how to add memory to an arbitrary chain. Right now, you can use the memory classes but need to hook it up manually"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "7998efd8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.memory import ConversationBufferMemory\n",
|
||||||
|
"from langchain.schema.runnable import RunnableMap\n",
|
||||||
|
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
|
||||||
|
"\n",
|
||||||
|
"model = ChatOpenAI()\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_messages([\n",
|
||||||
|
" (\"system\", \"You are a helpful chatbot\"),\n",
|
||||||
|
" MessagesPlaceholder(variable_name=\"history\"),\n",
|
||||||
|
" (\"human\", \"{input}\")\n",
|
||||||
|
"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "fa0087f3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"memory = ConversationBufferMemory(return_messages=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "06b531ae",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'history': []}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"memory.load_memory_variables({})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "d9437af6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = RunnableMap({\n",
|
||||||
|
" \"input\": lambda x: x[\"input\"],\n",
|
||||||
|
" \"memory\": memory.load_memory_variables\n",
|
||||||
|
"}) | {\n",
|
||||||
|
" \"input\": lambda x: x[\"input\"],\n",
|
||||||
|
" \"history\": lambda x: x[\"memory\"][\"history\"]\n",
|
||||||
|
"} | prompt | model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "bed1e260",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"inputs = {\"input\": \"hi im bob\"}\n",
|
||||||
|
"response = chain.invoke(inputs)\n",
|
||||||
|
"response"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "890475b4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"memory.save_context(inputs, {\"output\": response.content})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "e8fcb77f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'history': [HumanMessage(content='hi im bob', additional_kwargs={}, example=False),\n",
|
||||||
|
" AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)]}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"memory.load_memory_variables({})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "d837d5c3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='Your name is Bob.', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"inputs = {\"input\": \"whats my name\"}\n",
|
||||||
|
"response = chain.invoke(inputs)\n",
|
||||||
|
"response"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,133 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4927a727-b4c8-453c-8c83-bd87b4fcac14",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Adding moderation\n",
|
||||||
|
"\n",
|
||||||
|
"This shows how to add in moderation (or other safeguards) around your LLM application."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"id": "4f5f6449-940a-4f5c-97c0-39b71c3e2a68",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chains import OpenAIModerationChain\n",
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.prompts import ChatPromptTemplate"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "fcb8312b-7e7a-424f-a3ec-76738c9a9d21",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"moderate = OpenAIModerationChain()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "b24b9148-f6b0-4091-8ea8-d3fb281bd950",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"model = OpenAI()\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_messages([\n",
|
||||||
|
" (\"system\", \"repeat after me: {input}\")\n",
|
||||||
|
"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "1c8ed87c-9ca6-4559-bf60-d40e94a0af08",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = prompt | model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "5256b9bd-381a-42b0-bfa8-7e6d18f853cb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'\\n\\nYou are stupid.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 23,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"input\": \"you are stupid\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "fe6e3b33-dc9a-49d5-b194-ba750c58a628",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"moderated_chain = chain | moderate"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"id": "d8ba0cbd-c739-4d23-be9f-6ae092bd5ffb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'input': '\\n\\nYou are stupid',\n",
|
||||||
|
" 'output': \"Text was found that violates OpenAI's content policy.\"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 25,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"moderated_chain.invoke({\"input\": \"you are stupid\"})"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,240 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "raw",
|
||||||
|
"id": "877102d1-02ea-4fa3-8ec7-a08e242b95b3",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---\n",
|
||||||
|
"sidebar_position: 2\n",
|
||||||
|
"title: Multiple chains\n",
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0f2bf8d3",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Runnables can easily be used to string together multiple Chains"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "d65d4e9e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'El país donde se encuentra la ciudad de Honolulu, donde nació Barack Obama, el 44º Presidente de los Estados Unidos, es Estados Unidos. Honolulu se encuentra en la isla de Oahu, en el estado de Hawái.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from operator import itemgetter\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.prompts import ChatPromptTemplate\n",
|
||||||
|
"from langchain.schema import StrOutputParser\n",
|
||||||
|
"\n",
|
||||||
|
"prompt1 = ChatPromptTemplate.from_template(\"what is the city {person} is from?\")\n",
|
||||||
|
"prompt2 = ChatPromptTemplate.from_template(\"what country is the city {city} in? respond in {language}\")\n",
|
||||||
|
"\n",
|
||||||
|
"model = ChatOpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"chain1 = prompt1 | model | StrOutputParser()\n",
|
||||||
|
"\n",
|
||||||
|
"chain2 = {\"city\": chain1, \"language\": itemgetter(\"language\")} | prompt2 | model | StrOutputParser()\n",
|
||||||
|
"\n",
|
||||||
|
"chain2.invoke({\"person\": \"obama\", \"language\": \"spanish\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "878f8176",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.schema.runnable import RunnableMap, RunnablePassthrough\n",
|
||||||
|
"\n",
|
||||||
|
"prompt1 = ChatPromptTemplate.from_template(\"generate a {attribute} color. Return the name of the color and nothing else:\")\n",
|
||||||
|
"prompt2 = ChatPromptTemplate.from_template(\"what is a fruit of color: {color}. Return the name of the fruit and nothing else:\")\n",
|
||||||
|
"prompt3 = ChatPromptTemplate.from_template(\"what is a country with a flag that has the color: {color}. Return the name of the country and nothing else:\")\n",
|
||||||
|
"prompt4 = ChatPromptTemplate.from_template(\"What is the color of {fruit} and the flag of {country}?\")\n",
|
||||||
|
"\n",
|
||||||
|
"model_parser = model | StrOutputParser()\n",
|
||||||
|
"\n",
|
||||||
|
"color_generator = {\"attribute\": RunnablePassthrough()} | prompt1 | {\"color\": model_parser}\n",
|
||||||
|
"color_to_fruit = prompt2 | model_parser\n",
|
||||||
|
"color_to_country = prompt3 | model_parser\n",
|
||||||
|
"question_generator = color_generator | {\"fruit\": color_to_fruit, \"country\": color_to_country} | prompt4"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "d621a870",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"ChatPromptValue(messages=[HumanMessage(content='What is the color of strawberry and the flag of China?', additional_kwargs={}, example=False)])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"question_generator.invoke({\"warm\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "b4a9812b-bead-4fd9-ae27-0b8be57e5dc1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='The color of an apple is typically red or green. The flag of China is predominantly red with a large yellow star in the upper left corner and four smaller yellow stars surrounding it.', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"prompt = question_generator.invoke({\"warm\"})\n",
|
||||||
|
"model.invoke(prompt)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "6d75a313-f1c8-4e94-9a17-24e0bf4a2bdc",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Branching and Merging\n",
|
||||||
|
"\n",
|
||||||
|
"You may want the output of one component to be processed by 2 or more other components. [RunnableMaps](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableMap.html) let you split or fork the chain so multiple components can process the input in parallel. Later, other components can join or merge the results to synthesize a final response. This type of chain creates a computation graph that looks like the following:\n",
|
||||||
|
"\n",
|
||||||
|
"```text\n",
|
||||||
|
" Input\n",
|
||||||
|
" / \\\n",
|
||||||
|
" / \\\n",
|
||||||
|
" Branch1 Branch2\n",
|
||||||
|
" \\ /\n",
|
||||||
|
" \\ /\n",
|
||||||
|
" Combine\n",
|
||||||
|
"```"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "247fa0bd-4596-4063-8cb3-1d7fc119d982",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"planner = (\n",
|
||||||
|
" ChatPromptTemplate.from_template(\n",
|
||||||
|
" \"Generate an argument about: {input}\"\n",
|
||||||
|
" )\n",
|
||||||
|
" | ChatOpenAI()\n",
|
||||||
|
" | StrOutputParser()\n",
|
||||||
|
" | {\"base_response\": RunnablePassthrough()}\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"arguments_for = (\n",
|
||||||
|
" ChatPromptTemplate.from_template(\n",
|
||||||
|
" \"List the pros or positive aspects of {base_response}\"\n",
|
||||||
|
" )\n",
|
||||||
|
" | ChatOpenAI()\n",
|
||||||
|
" | StrOutputParser()\n",
|
||||||
|
")\n",
|
||||||
|
"arguments_against = (\n",
|
||||||
|
" ChatPromptTemplate.from_template(\n",
|
||||||
|
" \"List the cons or negative aspects of {base_response}\"\n",
|
||||||
|
" )\n",
|
||||||
|
" | ChatOpenAI()\n",
|
||||||
|
" | StrOutputParser()\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"final_responder = (\n",
|
||||||
|
" ChatPromptTemplate.from_messages(\n",
|
||||||
|
" [\n",
|
||||||
|
" (\"ai\", \"{original_response}\"),\n",
|
||||||
|
" (\"human\", \"Pros:\\n{results_1}\\n\\nCons:\\n{results_2}\"),\n",
|
||||||
|
" (\"system\", \"Generate a final response given the critique\"),\n",
|
||||||
|
" ]\n",
|
||||||
|
" )\n",
|
||||||
|
" | ChatOpenAI()\n",
|
||||||
|
" | StrOutputParser()\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"chain = (\n",
|
||||||
|
" planner \n",
|
||||||
|
" | {\n",
|
||||||
|
" \"results_1\": arguments_for,\n",
|
||||||
|
" \"results_2\": arguments_against,\n",
|
||||||
|
" \"original_response\": itemgetter(\"base_response\"),\n",
|
||||||
|
" }\n",
|
||||||
|
" | final_responder\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "2564f310-0674-4bb1-9c4e-d7848ca73511",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'While Scrum has its potential cons and challenges, many organizations have successfully embraced and implemented this project management framework to great effect. The cons mentioned above can be mitigated or overcome with proper training, support, and a commitment to continuous improvement. It is also important to note that not all cons may be applicable to every organization or project.\\n\\nFor example, while Scrum may be complex initially, with proper training and guidance, teams can quickly grasp the concepts and practices. The lack of predictability can be mitigated by implementing techniques such as velocity tracking and release planning. The limited documentation can be addressed by maintaining a balance between lightweight documentation and clear communication among team members. The dependency on team collaboration can be improved through effective communication channels and regular team-building activities.\\n\\nScrum can be scaled and adapted to larger projects by using frameworks like Scrum of Scrums or LeSS (Large Scale Scrum). Concerns about speed versus quality can be addressed by incorporating quality assurance practices, such as continuous integration and automated testing, into the Scrum process. Scope creep can be managed by having a well-defined and prioritized product backlog, and a strong product owner can be developed through training and mentorship.\\n\\nResistance to change can be overcome by providing proper education and communication to stakeholders and involving them in the decision-making process. Ultimately, the cons of Scrum can be seen as opportunities for growth and improvement, and with the right mindset and support, they can be effectively managed.\\n\\nIn conclusion, while Scrum may have its challenges and potential cons, the benefits and advantages it offers in terms of collaboration, flexibility, adaptability, transparency, and customer satisfaction make it a widely adopted and successful project management framework. 
With proper implementation and continuous improvement, organizations can leverage Scrum to drive innovation, efficiency, and project success.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"input\": \"scrum\"})"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "poetry-venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "poetry-venv"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,431 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "raw",
|
||||||
|
"id": "abf7263d-3a62-4016-b5d5-b157f92f2070",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---\n",
|
||||||
|
"sidebar_position: 0\n",
|
||||||
|
"title: Prompt + LLM\n",
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "9a434f2b-9405-468c-9dfd-254d456b57a6",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The most common and valuable composition is taking:\n",
|
||||||
|
"\n",
|
||||||
|
"``PromptTemplate`` / ``ChatPromptTemplate`` -> ``LLM`` / ``ChatModel`` -> ``OutputParser``\n",
|
||||||
|
"\n",
|
||||||
|
"Almost any other chains you build will use this building block."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "93aa2c87",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## PromptTemplate + LLM\n",
|
||||||
|
"\n",
|
||||||
|
"The simplest composition is just combing a prompt and model to create a chain that takes user input, adds it to a prompt, passes it to a model, and returns the raw model input.\n",
|
||||||
|
"\n",
|
||||||
|
"Note, you can mix and match PromptTemplate/ChatPromptTemplates and LLMs/ChatModels as you like here."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "466b65b3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.prompts import ChatPromptTemplate\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_template(\"tell me a joke about {foo}\")\n",
|
||||||
|
"model = ChatOpenAI()\n",
|
||||||
|
"chain = prompt | model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "e3d0a6cd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"foo\": \"bears\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7eb9ef50",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Often times we want to attach kwargs that'll be passed to each model call. Here's a few examples of that:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0b1d8f88",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Attaching Stop Sequences"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "562a06bf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = prompt | model.bind(stop=[\"\\n\"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "43f5d04c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='Why did the bear never wear shoes?', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"foo\": \"bears\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f3eaf88a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Attaching Function Call information"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "f94b71b2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"functions = [\n",
|
||||||
|
" {\n",
|
||||||
|
" \"name\": \"joke\",\n",
|
||||||
|
" \"description\": \"A joke\",\n",
|
||||||
|
" \"parameters\": {\n",
|
||||||
|
" \"type\": \"object\",\n",
|
||||||
|
" \"properties\": {\n",
|
||||||
|
" \"setup\": {\n",
|
||||||
|
" \"type\": \"string\",\n",
|
||||||
|
" \"description\": \"The setup for the joke\"\n",
|
||||||
|
" },\n",
|
||||||
|
" \"punchline\": {\n",
|
||||||
|
" \"type\": \"string\",\n",
|
||||||
|
" \"description\": \"The punchline for the joke\"\n",
|
||||||
|
" }\n",
|
||||||
|
" },\n",
|
||||||
|
" \"required\": [\"setup\", \"punchline\"]\n",
|
||||||
|
" }\n",
|
||||||
|
" }\n",
|
||||||
|
" ]\n",
|
||||||
|
"chain = prompt | model.bind(function_call= {\"name\": \"joke\"}, functions= functions)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "decf7710",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='', additional_kwargs={'function_call': {'name': 'joke', 'arguments': '{\\n \"setup\": \"Why don\\'t bears wear shoes?\",\\n \"punchline\": \"Because they have bear feet!\"\\n}'}}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"foo\": \"bears\"}, config={})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "9098c5ed",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## PromptTemplate + LLM + OutputParser\n",
|
||||||
|
"\n",
|
||||||
|
"We can also add in an output parser to easily trasform the raw LLM/ChatModel output into a more workable format"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "cc194c78",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||||
|
"\n",
|
||||||
|
"chain = prompt | model | StrOutputParser()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "77acf448",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Notice that this now returns a string - a much more workable format for downstream tasks"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "e3d69a18",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"foo\": \"bears\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "c01864e5",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Functions Output Parser\n",
|
||||||
|
"\n",
|
||||||
|
"When you specify the function to return, you may just want to parse that directly"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "ad0dd88e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n",
|
||||||
|
"\n",
|
||||||
|
"chain = (\n",
|
||||||
|
" prompt \n",
|
||||||
|
" | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n",
|
||||||
|
" | JsonOutputFunctionsParser()\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "1e7aa8eb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'setup': \"Why don't bears like fast food?\",\n",
|
||||||
|
" 'punchline': \"Because they can't catch it!\"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"foo\": \"bears\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "d4aa1a01",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser\n",
|
||||||
|
"\n",
|
||||||
|
"chain = (\n",
|
||||||
|
" prompt \n",
|
||||||
|
" | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n",
|
||||||
|
" | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "8b6df9ba",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"\"Why don't bears wear shoes?\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"foo\": \"bears\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "023fbccb-ef7d-489e-a9ba-f98e17283d51",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Simplifying input\n",
|
||||||
|
"\n",
|
||||||
|
"To make invocation even simpler, we can add a `RunnableMap` to take care of creating the prompt input dict for us:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "9601c0f0-71f9-4bd4-a672-7bd04084b018",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.schema.runnable import RunnableMap, RunnablePassthrough\n",
|
||||||
|
"\n",
|
||||||
|
"map_ = RunnableMap({\"foo\": RunnablePassthrough()})\n",
|
||||||
|
"chain = (\n",
|
||||||
|
" map_ \n",
|
||||||
|
" | prompt\n",
|
||||||
|
" | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n",
|
||||||
|
" | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "7ec4f154-fda5-4847-9220-41aa902fdc33",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"\"Why don't bears wear shoes?\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke(\"bears\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "def00bfe-0f83-4805-8c8f-8a53f99fa8ea",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Since we're composing our map with another Runnable, we can even use some syntactic sugar and just use a dict:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "7bf3846a-02ee-41a3-ba1b-a708827d4f3a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = (\n",
|
||||||
|
" {\"foo\": RunnablePassthrough()} \n",
|
||||||
|
" | prompt\n",
|
||||||
|
" | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n",
|
||||||
|
" | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "e566d6a1-538d-4cb5-a210-a63e082e4c74",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"\"Why don't bears like fast food?\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 22,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke(\"bears\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,461 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "raw",
|
||||||
|
"id": "abe47592-909c-4844-bf44-9e55c2fb4bfa",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---\n",
|
||||||
|
"sidebar_position: 1\n",
|
||||||
|
"title: RAG\n",
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "91c5ef3d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Let's look at adding in a retrieval step to a prompt and LLM, which adds up to a \"retrieval-augmented generation\" chain"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7f25d9e9-d192-42e9-af50-5660a4bfb0d9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip install langchain openai faiss-cpu"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "33be32af",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from operator import itemgetter\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.prompts import ChatPromptTemplate\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.embeddings import OpenAIEmbeddings\n",
|
||||||
|
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||||
|
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||||
|
"from langchain.vectorstores import FAISS"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "bfc47ec1",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"vectorstore = FAISS.from_texts([\"harrison worked at kensho\"], embedding=OpenAIEmbeddings())\n",
|
||||||
|
"retriever = vectorstore.as_retriever()\n",
|
||||||
|
"\n",
|
||||||
|
"template = \"\"\"Answer the question based only on the following context:\n",
|
||||||
|
"{context}\n",
|
||||||
|
"\n",
|
||||||
|
"Question: {question}\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_template(template)\n",
|
||||||
|
"\n",
|
||||||
|
"model = ChatOpenAI()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "eae31755",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = (\n",
|
||||||
|
" {\"context\": retriever, \"question\": RunnablePassthrough()} \n",
|
||||||
|
" | prompt \n",
|
||||||
|
" | model \n",
|
||||||
|
" | StrOutputParser()\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "f3040b0c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Harrison worked at Kensho.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke(\"where did harrison work?\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "e1d20c7c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"template = \"\"\"Answer the question based only on the following context:\n",
|
||||||
|
"{context}\n",
|
||||||
|
"\n",
|
||||||
|
"Question: {question}\n",
|
||||||
|
"\n",
|
||||||
|
"Answer in the following language: {language}\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_template(template)\n",
|
||||||
|
"\n",
|
||||||
|
"chain = {\n",
|
||||||
|
" \"context\": itemgetter(\"question\") | retriever, \n",
|
||||||
|
" \"question\": itemgetter(\"question\"), \n",
|
||||||
|
" \"language\": itemgetter(\"language\")\n",
|
||||||
|
"} | prompt | model | StrOutputParser()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "7ee8b2d4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Harrison ha lavorato a Kensho.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"question\": \"where did harrison work\", \"language\": \"italian\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f007669c",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Conversational Retrieval Chain\n",
|
||||||
|
"\n",
|
||||||
|
"We can easily add in conversation history. This primarily means adding in chat_message_history"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "3f30c348",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.schema.runnable import RunnableMap\n",
|
||||||
|
"from langchain.schema import format_document"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "64ab1dbf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||||
|
"\n",
|
||||||
|
"_template = \"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n",
|
||||||
|
"\n",
|
||||||
|
"Chat History:\n",
|
||||||
|
"{chat_history}\n",
|
||||||
|
"Follow Up Input: {question}\n",
|
||||||
|
"Standalone question:\"\"\"\n",
|
||||||
|
"CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "7d628c97",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"template = \"\"\"Answer the question based only on the following context:\n",
|
||||||
|
"{context}\n",
|
||||||
|
"\n",
|
||||||
|
"Question: {question}\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"ANSWER_PROMPT = ChatPromptTemplate.from_template(template)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "f60a5d0f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template=\"{page_content}\")\n",
|
||||||
|
"def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator=\"\\n\\n\"):\n",
|
||||||
|
" doc_strings = [format_document(doc, document_prompt) for doc in docs]\n",
|
||||||
|
" return document_separator.join(doc_strings)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "7d007db6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from typing import Tuple, List\n",
|
||||||
|
"def _format_chat_history(chat_history: List[Tuple]) -> str:\n",
|
||||||
|
" buffer = \"\"\n",
|
||||||
|
" for dialogue_turn in chat_history:\n",
|
||||||
|
" human = \"Human: \" + dialogue_turn[0]\n",
|
||||||
|
" ai = \"Assistant: \" + dialogue_turn[1]\n",
|
||||||
|
" buffer += \"\\n\" + \"\\n\".join([human, ai])\n",
|
||||||
|
" return buffer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "5c32cc89",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"_inputs = RunnableMap(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"standalone_question\": {\n",
|
||||||
|
" \"question\": lambda x: x[\"question\"],\n",
|
||||||
|
" \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n",
|
||||||
|
" } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"_context = {\n",
|
||||||
|
" \"context\": itemgetter(\"standalone_question\") | retriever | _combine_documents,\n",
|
||||||
|
" \"question\": lambda x: x[\"standalone_question\"]\n",
|
||||||
|
"}\n",
|
||||||
|
"conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "135c8205",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"conversational_qa_chain.invoke({\n",
|
||||||
|
" \"question\": \"where did harrison work?\",\n",
|
||||||
|
" \"chat_history\": [],\n",
|
||||||
|
"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"id": "424e7e7a",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='Harrison worked at Kensho.', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"conversational_qa_chain.invoke({\n",
|
||||||
|
" \"question\": \"where did he work?\",\n",
|
||||||
|
" \"chat_history\": [(\"Who wrote this notebook?\", \"Harrison\")],\n",
|
||||||
|
"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "c5543183",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### With Memory and returning source documents\n",
|
||||||
|
"\n",
|
||||||
|
"This shows how to use memory with the above. For memory, we need to manage that outside at the memory. For returning the retrieved documents, we just need to pass them through all the way."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"id": "e31dd17c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.memory import ConversationBufferMemory"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 17,
|
||||||
|
"id": "d4bffe94",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"memory = ConversationBufferMemory(return_messages=True, output_key=\"answer\", input_key=\"question\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 18,
|
||||||
|
"id": "733be985",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# First we add a step to load memory\n",
|
||||||
|
"# This needs to be a RunnableMap because its the first input\n",
|
||||||
|
"loaded_memory = RunnableMap(\n",
|
||||||
|
" {\n",
|
||||||
|
" \"question\": itemgetter(\"question\"),\n",
|
||||||
|
" \"memory\": memory.load_memory_variables,\n",
|
||||||
|
" }\n",
|
||||||
|
")\n",
|
||||||
|
"# Next we add a step to expand memory into the variables\n",
|
||||||
|
"expanded_memory = {\n",
|
||||||
|
" \"question\": itemgetter(\"question\"),\n",
|
||||||
|
" \"chat_history\": lambda x: x[\"memory\"][\"history\"]\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"# Now we calculate the standalone question\n",
|
||||||
|
"standalone_question = {\n",
|
||||||
|
" \"standalone_question\": {\n",
|
||||||
|
" \"question\": lambda x: x[\"question\"],\n",
|
||||||
|
" \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n",
|
||||||
|
" } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n",
|
||||||
|
"}\n",
|
||||||
|
"# Now we retrieve the documents\n",
|
||||||
|
"retrieved_documents = {\n",
|
||||||
|
" \"docs\": itemgetter(\"standalone_question\") | retriever,\n",
|
||||||
|
" \"question\": lambda x: x[\"standalone_question\"]\n",
|
||||||
|
"}\n",
|
||||||
|
"# Now we construct the inputs for the final prompt\n",
|
||||||
|
"final_inputs = {\n",
|
||||||
|
" \"context\": lambda x: _combine_documents(x[\"docs\"]),\n",
|
||||||
|
" \"question\": itemgetter(\"question\")\n",
|
||||||
|
"}\n",
|
||||||
|
"# And finally, we do the part that returns the answers\n",
|
||||||
|
"answer = {\n",
|
||||||
|
" \"answer\": final_inputs | ANSWER_PROMPT | ChatOpenAI(),\n",
|
||||||
|
" \"docs\": itemgetter(\"docs\"),\n",
|
||||||
|
"}\n",
|
||||||
|
"# And now we put it all together!\n",
|
||||||
|
"final_chain = loaded_memory | expanded_memory | standalone_question | retrieved_documents | answer"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 19,
|
||||||
|
"id": "806e390c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'answer': AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False),\n",
|
||||||
|
" 'docs': [Document(page_content='harrison worked at kensho', metadata={})]}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 19,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"inputs = {\"question\": \"where did harrison work?\"}\n",
|
||||||
|
"result = final_chain.invoke(inputs)\n",
|
||||||
|
"result"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"id": "977399fd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Note that the memory does not save automatically\n",
|
||||||
|
"# This will be improved in the future\n",
|
||||||
|
"# For now you need to save it yourself\n",
|
||||||
|
"memory.save_context(inputs, {\"answer\": result[\"answer\"].content})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "f94f7de4",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'history': [HumanMessage(content='where did harrison work?', additional_kwargs={}, example=False),\n",
|
||||||
|
" AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)]}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 21,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"memory.load_memory_variables({})"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "poetry-venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "poetry-venv"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,227 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "raw",
|
||||||
|
"id": "c14da114-1a4a-487d-9cff-e0e8c30ba366",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"---\n",
|
||||||
|
"sidebar_position: 3\n",
|
||||||
|
"title: Querying a SQL DB\n",
|
||||||
|
"---"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "506e9636",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can replicate our SQLDatabaseChain with Runnables."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "7a927516",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.prompts import ChatPromptTemplate\n",
|
||||||
|
"\n",
|
||||||
|
"template = \"\"\"Based on the table schema below, write a SQL query that would answer the user's question:\n",
|
||||||
|
"{schema}\n",
|
||||||
|
"\n",
|
||||||
|
"Question: {question}\n",
|
||||||
|
"SQL Query:\"\"\"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_template(template)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "3f51f386",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.utilities import SQLDatabase"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7c3449d6-684b-416e-ba16-90a035835a88",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We'll need the Chinook sample DB for this example. There's many places to download it from, e.g. https://database.guide/2-sample-databases-sqlite/"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 20,
|
||||||
|
"id": "2ccca6fc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"db = SQLDatabase.from_uri(\"sqlite:///./Chinook.db\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 21,
|
||||||
|
"id": "05ba88ee",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def get_schema(_):\n",
|
||||||
|
" return db.get_table_info()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 22,
|
||||||
|
"id": "a4eda902",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"def run_query(query):\n",
|
||||||
|
" return db.run(query)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 23,
|
||||||
|
"id": "5046cb17",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from operator import itemgetter\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||||
|
"from langchain.schema.runnable import RunnableLambda, RunnableMap\n",
|
||||||
|
"\n",
|
||||||
|
"model = ChatOpenAI()\n",
|
||||||
|
"\n",
|
||||||
|
"inputs = {\n",
|
||||||
|
" \"schema\": RunnableLambda(get_schema),\n",
|
||||||
|
" \"question\": itemgetter(\"question\")\n",
|
||||||
|
"}\n",
|
||||||
|
"sql_response = (\n",
|
||||||
|
" RunnableMap(inputs)\n",
|
||||||
|
" | prompt\n",
|
||||||
|
" | model.bind(stop=[\"\\nSQLResult:\"])\n",
|
||||||
|
" | StrOutputParser()\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 24,
|
||||||
|
"id": "a5552039",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'SELECT COUNT(*) FROM Employee'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 24,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"sql_response.invoke({\"question\": \"How many employees are there?\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 25,
|
||||||
|
"id": "d6fee130",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"template = \"\"\"Based on the table schema below, question, sql query, and sql response, write a natural language response:\n",
|
||||||
|
"{schema}\n",
|
||||||
|
"\n",
|
||||||
|
"Question: {question}\n",
|
||||||
|
"SQL Query: {query}\n",
|
||||||
|
"SQL Response: {response}\"\"\"\n",
|
||||||
|
"prompt_response = ChatPromptTemplate.from_template(template)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 26,
|
||||||
|
"id": "923aa634",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"full_chain = (\n",
|
||||||
|
" RunnableMap({\n",
|
||||||
|
" \"question\": itemgetter(\"question\"),\n",
|
||||||
|
" \"query\": sql_response,\n",
|
||||||
|
" }) \n",
|
||||||
|
" | {\n",
|
||||||
|
" \"schema\": RunnableLambda(get_schema),\n",
|
||||||
|
" \"question\": itemgetter(\"question\"),\n",
|
||||||
|
" \"query\": itemgetter(\"query\"),\n",
|
||||||
|
" \"response\": lambda x: db.run(x[\"query\"]) \n",
|
||||||
|
" } \n",
|
||||||
|
" | prompt_response \n",
|
||||||
|
" | model\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 27,
|
||||||
|
"id": "e94963d8",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='There are 8 employees.', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 27,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"full_chain.invoke({\"question\": \"How many employees are there?\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "4f358d7b-a721-4db3-9f92-f06913428afc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,122 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "29781123",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Using tools\n",
|
||||||
|
"\n",
|
||||||
|
"You can use any Tools with Runnables easily."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a5c579dd-2e22-41b0-a789-346dfdecb5a2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip install duckduckgo-search"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "9232d2a9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.prompts import ChatPromptTemplate\n",
|
||||||
|
"from langchain.schema.output_parser import StrOutputParser\n",
|
||||||
|
"from langchain.tools import DuckDuckGoSearchRun"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "a0c64d2c",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"search = DuckDuckGoSearchRun()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "391969b6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"template = \"\"\"turn the following user input into a search query for a search engine:\n",
|
||||||
|
"\n",
|
||||||
|
"{input}\"\"\"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_template(template)\n",
|
||||||
|
"\n",
|
||||||
|
"model = ChatOpenAI()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "e3d9d20d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chain = prompt | model | StrOutputParser() | search"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "55f2967d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'What sports games are on TV today & tonight? Watch and stream live sports on TV today, tonight, tomorrow. Today\\'s 2023 sports TV schedule includes football, basketball, baseball, hockey, motorsports, soccer and more. Watch on TV or stream online on ESPN, FOX, FS1, CBS, NBC, ABC, Peacock, Paramount+, fuboTV, local channels and many other networks. MLB Games Tonight: How to Watch on TV, Streaming & Odds - Thursday, September 7. Seattle Mariners\\' Julio Rodriguez greets teammates in the dugout after scoring against the Oakland Athletics in a ... Circle - Country Music and Lifestyle. Live coverage of all the MLB action today is available to you, with the information provided below. The Brewers will look to pick up a road win at PNC Park against the Pirates on Wednesday at 12:35 PM ET. Check out the latest odds and with BetMGM Sportsbook. Use bonus code \"GNPLAY\" for special offers! MLB Games Tonight: How to Watch on TV, Streaming & Odds - Tuesday, September 5. Houston Astros\\' Kyle Tucker runs after hitting a double during the fourth inning of a baseball game against the Los Angeles Angels, Sunday, Aug. 13, 2023, in Houston. (AP Photo/Eric Christian Smith) (APMedia) The Houston Astros versus the Texas Rangers is one of ... The second half of tonight\\'s college football schedule still has some good games remaining to watch on your television.. We\\'ve already seen an exciting one when Colorado upset TCU. And we saw some ...'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"input\": \"I'd like to figure out what games are tonight\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "a16949cf-00ea-43c6-a6aa-797ad4f6918d",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "poetry-venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "poetry-venv"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,2 @@
|
|||||||
|
label: 'How to'
|
||||||
|
position: 1
|
@ -0,0 +1,158 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "fbc4bf6e",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Run arbitrary functions\n",
|
||||||
|
"\n",
|
||||||
|
"You can use arbitrary functions in the pipeline\n",
|
||||||
|
"\n",
|
||||||
|
"Note that all inputs to these functions need to be a SINGLE argument. If you have a function that accepts multiple arguments, you should write a wrapper that accepts a single input and unpacks it into multiple argument."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 77,
|
||||||
|
"id": "6bb221b3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.schema.runnable import RunnableLambda\n",
|
||||||
|
"\n",
|
||||||
|
"def length_function(text):\n",
|
||||||
|
" return len(text)\n",
|
||||||
|
"\n",
|
||||||
|
"def _multiple_length_function(text1, text2):\n",
|
||||||
|
" return len(text1) * len(text2)\n",
|
||||||
|
"\n",
|
||||||
|
"def multiple_length_function(_dict):\n",
|
||||||
|
" return _multiple_length_function(_dict[\"text1\"], _dict[\"text2\"])\n",
|
||||||
|
"\n",
|
||||||
|
"prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n",
|
||||||
|
"\n",
|
||||||
|
"chain1 = prompt | model\n",
|
||||||
|
"\n",
|
||||||
|
"chain = {\n",
|
||||||
|
" \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n",
|
||||||
|
" \"b\": {\"text1\": itemgetter(\"foo\"), \"text2\": itemgetter(\"bar\")} | RunnableLambda(multiple_length_function)\n",
|
||||||
|
"} | prompt | model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 78,
|
||||||
|
"id": "5488ec85",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content='3 + 9 equals 12.', additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 78,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.invoke({\"foo\": \"bar\", \"bar\": \"gah\"})"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "4728ddd9-914d-42ce-ae9b-72c9ce8ec940",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Accepting a Runnable Config\n",
|
||||||
|
"\n",
|
||||||
|
"Runnable lambdas can optionally accept a [RunnableConfig](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.config.RunnableConfig.html?highlight=runnableconfig#langchain.schema.runnable.config.RunnableConfig), which they can use to pass callbacks, tags, and other configuration information to nested runs."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 139,
|
||||||
|
"id": "80b3b5f6-5d58-44b9-807e-cce9a46bf49f",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.schema.runnable import RunnableConfig"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 149,
|
||||||
|
"id": "ff0daf0c-49dd-4d21-9772-e5fa133c5f36",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import json\n",
|
||||||
|
"\n",
|
||||||
|
"def parse_or_fix(text: str, config: RunnableConfig):\n",
|
||||||
|
" fixing_chain = (\n",
|
||||||
|
" ChatPromptTemplate.from_template(\n",
|
||||||
|
" \"Fix the following text:\\n\\n```text\\n{input}\\n```\\nError: {error}\"\n",
|
||||||
|
" \" Don't narrate, just respond with the fixed data.\"\n",
|
||||||
|
" )\n",
|
||||||
|
" | ChatOpenAI()\n",
|
||||||
|
" | StrOutputParser()\n",
|
||||||
|
" )\n",
|
||||||
|
" for _ in range(3):\n",
|
||||||
|
" try:\n",
|
||||||
|
" return json.loads(text)\n",
|
||||||
|
" except Exception as e:\n",
|
||||||
|
" text = fixing_chain.invoke({\"input\": text, \"error\": e}, config)\n",
|
||||||
|
" return \"Failed to parse\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 152,
|
||||||
|
"id": "1a5e709e-9d75-48c7-bb9c-503251990505",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Tokens Used: 65\n",
|
||||||
|
"\tPrompt Tokens: 56\n",
|
||||||
|
"\tCompletion Tokens: 9\n",
|
||||||
|
"Successful Requests: 1\n",
|
||||||
|
"Total Cost (USD): $0.00010200000000000001\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.callbacks import get_openai_callback\n",
|
||||||
|
"\n",
|
||||||
|
"with get_openai_callback() as cb:\n",
|
||||||
|
" RunnableLambda(parse_or_fix).invoke(\"{foo: bar}\", {\"tags\": [\"my-tag\"], \"callbacks\": [cb]})\n",
|
||||||
|
" print(cb)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,520 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Mutli-language data anonymization with Microsoft Presidio\n",
|
||||||
|
"\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Use case\n",
|
||||||
|
"\n",
|
||||||
|
"Multi-language support in data pseudonymization is essential due to differences in language structures and cultural contexts. Different languages may have varying formats for personal identifiers. For example, the structure of names, locations and dates can differ greatly between languages and regions. Furthermore, non-alphanumeric characters, accents, and the direction of writing can impact pseudonymization processes. Without multi-language support, data could remain identifiable or be misinterpreted, compromising data privacy and accuracy. Hence, it enables effective and precise pseudonymization suited for global operations.\n",
|
||||||
|
"\n",
|
||||||
|
"## Overview\n",
|
||||||
|
"\n",
|
||||||
|
"PII detection in Microsoft Presidio relies on several components - in addition to the usual pattern matching (e.g. using regex), the analyser uses a model for Named Entity Recognition (NER) to extract entities such as:\n",
|
||||||
|
"- `PERSON`\n",
|
||||||
|
"- `LOCATION`\n",
|
||||||
|
"- `DATE_TIME`\n",
|
||||||
|
"- `NRP`\n",
|
||||||
|
"- `ORGANIZATION`\n",
|
||||||
|
"\n",
|
||||||
|
"[[Source]](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)\n",
|
||||||
|
"\n",
|
||||||
|
"To handle NER in specific languages, we utilize unique models from the `spaCy` library, recognized for its extensive selection covering multiple languages and sizes. However, it's not restrictive, allowing for integration of alternative frameworks such as [Stanza](https://microsoft.github.io/presidio/analyzer/nlp_engines/spacy_stanza/) or [transformers](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/) when necessary.\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Quickstart\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Install necessary packages\n",
|
||||||
|
"# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n",
|
||||||
|
"# ! python -m spacy download en_core_web_lg"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer = PresidioReversibleAnonymizer(\n",
|
||||||
|
" analyzed_fields=[\"PERSON\"],\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"By default, `PresidioAnonymizer` and `PresidioReversibleAnonymizer` use a model trained on English texts, so they handle other languages moderately well. \n",
|
||||||
|
"\n",
|
||||||
|
"For example, here the model did not detect the person:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Me llamo Sofía'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"anonymizer.anonymize(\"Me llamo Sofía\") # \"My name is Sofía\" in Spanish"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"They may also take words from another language as actual entities. Here, both the word *'Yo'* (*'I'* in Spanish) and *Sofía* have been classified as `PERSON`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Bridget Kirk soy Sally Knight'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"anonymizer.anonymize(\"Yo soy Sofía\") # \"I am Sofía\" in Spanish"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"If you want to anonymise texts from other languages, you need to download other models and add them to the anonymiser configuration:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Download the models for the languages you want to use\n",
|
||||||
|
"# ! python -m spacy download en_core_web_md\n",
|
||||||
|
"# ! python -m spacy download es_core_news_md"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"nlp_config = {\n",
|
||||||
|
" \"nlp_engine_name\": \"spacy\",\n",
|
||||||
|
" \"models\": [\n",
|
||||||
|
" {\"lang_code\": \"en\", \"model_name\": \"en_core_web_md\"},\n",
|
||||||
|
" {\"lang_code\": \"es\", \"model_name\": \"es_core_news_md\"},\n",
|
||||||
|
" ],\n",
|
||||||
|
"}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We have therefore added a Spanish language model. Note also that we have downloaded an alternative model for English as well - in this case we have replaced the large model `en_core_web_lg` (560MB) with its smaller version `en_core_web_md` (40MB) - the size is therefore reduced by 14 times! If you care about the speed of anonymisation, it is worth considering it.\n",
|
||||||
|
"\n",
|
||||||
|
"All models for the different languages can be found in the [spaCy documentation](https://spacy.io/usage/models).\n",
|
||||||
|
"\n",
|
||||||
|
"Now pass the configuration as the `languages_config` parameter to Anonymiser. As you can see, both previous examples work flawlessly:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Me llamo Michelle Smith\n",
|
||||||
|
"Yo soy Rachel Wright\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"anonymizer = PresidioReversibleAnonymizer(\n",
|
||||||
|
" analyzed_fields=[\"PERSON\"],\n",
|
||||||
|
" languages_config=nlp_config,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\n",
|
||||||
|
" anonymizer.anonymize(\"Me llamo Sofía\", language=\"es\")\n",
|
||||||
|
") # \"My name is Sofía\" in Spanish\n",
|
||||||
|
"print(anonymizer.anonymize(\"Yo soy Sofía\", language=\"es\")) # \"I am Sofía\" in Spanish"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"By default, the language indicated first in the configuration will be used when anonymising text (in this case English):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"My name is Ronnie Ayala\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(anonymizer.anonymize(\"My name is John\"))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Advanced usage\n",
|
||||||
|
"\n",
|
||||||
|
"### Custom labels in NER model"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"It may be that the spaCy model has different class names than those supported by the Microsoft Presidio by default. Take Polish, for example:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Text: Wiktoria, Start: 12, End: 20, Label: persName\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# ! python -m spacy download pl_core_news_md\n",
|
||||||
|
"\n",
|
||||||
|
"import spacy\n",
|
||||||
|
"\n",
|
||||||
|
"nlp = spacy.load(\"pl_core_news_md\")\n",
|
||||||
|
"doc = nlp(\"Nazywam się Wiktoria\") # \"My name is Wiktoria\" in Polish\n",
|
||||||
|
"\n",
|
||||||
|
"for ent in doc.ents:\n",
|
||||||
|
" print(\n",
|
||||||
|
" f\"Text: {ent.text}, Start: {ent.start_char}, End: {ent.end_char}, Label: {ent.label_}\"\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The name *Victoria* was classified as `persName`, which does not correspond to the default class names `PERSON`/`PER` implemented in Microsoft Presidio (look for `CHECK_LABEL_GROUPS` in [SpacyRecognizer implementation](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)). \n",
|
||||||
|
"\n",
|
||||||
|
"You can find out more about custom labels in spaCy models (including your own, trained ones) in [this thread](https://github.com/microsoft/presidio/issues/851).\n",
|
||||||
|
"\n",
|
||||||
|
"That's why our sentence will not be anonymized:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Nazywam się Wiktoria\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"nlp_config = {\n",
|
||||||
|
" \"nlp_engine_name\": \"spacy\",\n",
|
||||||
|
" \"models\": [\n",
|
||||||
|
" {\"lang_code\": \"en\", \"model_name\": \"en_core_web_md\"},\n",
|
||||||
|
" {\"lang_code\": \"es\", \"model_name\": \"es_core_news_md\"},\n",
|
||||||
|
" {\"lang_code\": \"pl\", \"model_name\": \"pl_core_news_md\"},\n",
|
||||||
|
" ],\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer = PresidioReversibleAnonymizer(\n",
|
||||||
|
" analyzed_fields=[\"PERSON\", \"LOCATION\", \"DATE_TIME\"],\n",
|
||||||
|
" languages_config=nlp_config,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"print(\n",
|
||||||
|
" anonymizer.anonymize(\"Nazywam się Wiktoria\", language=\"pl\")\n",
|
||||||
|
") # \"My name is Wiktoria\" in Polish"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"To address this, create your own `SpacyRecognizer` with your own class mapping and add it to the anonymizer:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from presidio_analyzer.predefined_recognizers import SpacyRecognizer\n",
|
||||||
|
"\n",
|
||||||
|
"polish_check_label_groups = [\n",
|
||||||
|
" ({\"LOCATION\"}, {\"placeName\", \"geogName\"}),\n",
|
||||||
|
" ({\"PERSON\"}, {\"persName\"}),\n",
|
||||||
|
" ({\"DATE_TIME\"}, {\"date\", \"time\"}),\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"spacy_recognizer = SpacyRecognizer(\n",
|
||||||
|
" supported_language=\"pl\",\n",
|
||||||
|
" check_label_groups=polish_check_label_groups,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.add_recognizer(spacy_recognizer)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Now everything works smoothly:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Nazywam się Morgan Walters\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\n",
|
||||||
|
" anonymizer.anonymize(\"Nazywam się Wiktoria\", language=\"pl\")\n",
|
||||||
|
") # \"My name is Wiktoria\" in Polish"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Let's try on more complex example:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Nazywam się Ernest Liu. New Taylorburgh to moje miasto rodzinne. Urodziłam się 1987-01-19\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\n",
|
||||||
|
" anonymizer.anonymize(\n",
|
||||||
|
" \"Nazywam się Wiktoria. Płock to moje miasto rodzinne. Urodziłam się dnia 6 kwietnia 2001 roku\",\n",
|
||||||
|
" language=\"pl\",\n",
|
||||||
|
" )\n",
|
||||||
|
") # \"My name is Wiktoria. Płock is my home town. I was born on 6 April 2001\" in Polish"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"As you can see, thanks to class mapping, the anonymiser can cope with different types of entities. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Custom language-specific operators\n",
|
||||||
|
"\n",
|
||||||
|
"In the example above, the sentence has been anonymised correctly, but the fake data does not fit the Polish language at all. Custom operators can therefore be added, which will resolve the issue:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from faker import Faker\n",
|
||||||
|
"from presidio_anonymizer.entities import OperatorConfig\n",
|
||||||
|
"\n",
|
||||||
|
"fake = Faker(locale=\"pl_PL\") # Setting faker to provide Polish data\n",
|
||||||
|
"\n",
|
||||||
|
"new_operators = {\n",
|
||||||
|
" \"PERSON\": OperatorConfig(\"custom\", {\"lambda\": lambda _: fake.first_name_female()}),\n",
|
||||||
|
" \"LOCATION\": OperatorConfig(\"custom\", {\"lambda\": lambda _: fake.city()}),\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.add_operators(new_operators)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Nazywam się Marianna. Szczecin to moje miasto rodzinne. Urodziłam się 1976-11-16\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\n",
|
||||||
|
" anonymizer.anonymize(\n",
|
||||||
|
" \"Nazywam się Wiktoria. Płock to moje miasto rodzinne. Urodziłam się dnia 6 kwietnia 2001 roku\",\n",
|
||||||
|
" language=\"pl\",\n",
|
||||||
|
" )\n",
|
||||||
|
") # \"My name is Wiktoria. Płock is my home town. I was born on 6 April 2001\" in Polish"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Limitations\n",
|
||||||
|
"\n",
|
||||||
|
"Remember - results are as good as your recognizers and as your NER models!\n",
|
||||||
|
"\n",
|
||||||
|
"Look at the example below - we downloaded the small model for Spanish (12MB) and it no longer performs as well as the medium version (40MB):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 16,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Model: es_core_news_sm. Result: Me llamo Sofía\n",
|
||||||
|
"Model: es_core_news_md. Result: Me llamo Lawrence Davis\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# ! python -m spacy download es_core_news_sm\n",
|
||||||
|
"\n",
|
||||||
|
"for model in [\"es_core_news_sm\", \"es_core_news_md\"]:\n",
|
||||||
|
" nlp_config = {\n",
|
||||||
|
" \"nlp_engine_name\": \"spacy\",\n",
|
||||||
|
" \"models\": [\n",
|
||||||
|
" {\"lang_code\": \"es\", \"model_name\": model},\n",
|
||||||
|
" ],\n",
|
||||||
|
" }\n",
|
||||||
|
"\n",
|
||||||
|
" anonymizer = PresidioReversibleAnonymizer(\n",
|
||||||
|
" analyzed_fields=[\"PERSON\"],\n",
|
||||||
|
" languages_config=nlp_config,\n",
|
||||||
|
" )\n",
|
||||||
|
"\n",
|
||||||
|
" print(\n",
|
||||||
|
" f\"Model: {model}. Result: {anonymizer.anonymize('Me llamo Sofía', language='es')}\"\n",
|
||||||
|
" )"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"In many cases, even the larger models from spaCy will not be sufficient - there are already other, more complex and better methods of detecting named entities, based on transformers. You can read more about this [here](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Future works\n",
|
||||||
|
"\n",
|
||||||
|
"- **automatic language detection** - instead of passing the language as a parameter in `anonymizer.anonymize`, we could detect the language/s beforehand and then use the corresponding NER model."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,461 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Reversible data anonymization with Microsoft Presidio\n",
|
||||||
|
"\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"## Use case\n",
|
||||||
|
"\n",
|
||||||
|
"We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n",
|
||||||
|
"\n",
|
||||||
|
"## Overview\n",
|
||||||
|
"\n",
|
||||||
|
"We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n",
|
||||||
|
"\n",
|
||||||
|
"1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n",
|
||||||
|
"```\n",
|
||||||
|
" {\n",
|
||||||
|
" \"PERSON\": {\n",
|
||||||
|
" \"<anonymized>\": \"<original>\",\n",
|
||||||
|
" \"John Doe\": \"Slim Shady\"\n",
|
||||||
|
" },\n",
|
||||||
|
" \"PHONE_NUMBER\": {\n",
|
||||||
|
" \"111-111-1111\": \"555-555-5555\"\n",
|
||||||
|
" }\n",
|
||||||
|
" ...\n",
|
||||||
|
" }\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n",
|
||||||
|
"\n",
|
||||||
|
"Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n",
|
||||||
|
"\n",
|
||||||
|
"## Quickstart\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Install necessary packages\n",
|
||||||
|
"# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n",
|
||||||
|
"# ! python -m spacy download en_core_web_lg"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. By the way, my card number is: 4838637940262'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer = PresidioReversibleAnonymizer(\n",
|
||||||
|
" analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
|
||||||
|
" # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
|
||||||
|
" # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
|
||||||
|
" faker_seed=42,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.anonymize(\n",
|
||||||
|
" \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
|
||||||
|
" \"By the way, my card number is: 4916 0387 9536 0861\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This is what the full string we want to deanonymize looks like:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Maria Lynch recently lost his wallet. \n",
|
||||||
|
"Inside is some cash and his credit card with the number 4838637940262. \n",
|
||||||
|
"If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n",
|
||||||
|
"Maria Lynch would be very grateful!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# We know this data, as we set the faker_seed parameter\n",
|
||||||
|
"fake_name = \"Maria Lynch\"\n",
|
||||||
|
"fake_phone = \"7344131647\"\n",
|
||||||
|
"fake_email = \"jamesmichael@example.com\"\n",
|
||||||
|
"fake_credit_card = \"4838637940262\"\n",
|
||||||
|
"\n",
|
||||||
|
"anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n",
|
||||||
|
"Inside is some cash and his credit card with the number {fake_credit_card}. \n",
|
||||||
|
"If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n",
|
||||||
|
"{fake_name} would be very grateful!\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"print(anonymized_text)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"And now, using the `deanonymize` method, we can reverse the process:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Slim Shady recently lost his wallet. \n",
|
||||||
|
"Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
|
||||||
|
"If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n",
|
||||||
|
"Slim Shady would be very grateful!\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(anonymizer.deanonymize(anonymized_text))"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"### Using with LangChain Expression Language\n",
|
||||||
|
"\n",
|
||||||
|
"With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"text = f\"\"\"Slim Shady recently lost his wallet. \n",
|
||||||
|
"Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
|
||||||
|
"If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Dear Sir/Madam,\n",
|
||||||
|
"\n",
|
||||||
|
"We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n",
|
||||||
|
"\n",
|
||||||
|
"If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n",
|
||||||
|
"\n",
|
||||||
|
"Your prompt assistance in this matter would be greatly appreciated.\n",
|
||||||
|
"\n",
|
||||||
|
"Yours faithfully,\n",
|
||||||
|
"\n",
|
||||||
|
"[Your Name]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer = PresidioReversibleAnonymizer()\n",
|
||||||
|
"\n",
|
||||||
|
"template = \"\"\"Rewrite this text into an official, short email:\n",
|
||||||
|
"\n",
|
||||||
|
"{anonymized_text}\"\"\"\n",
|
||||||
|
"prompt = PromptTemplate.from_template(template)\n",
|
||||||
|
"llm = ChatOpenAI(temperature=0)\n",
|
||||||
|
"\n",
|
||||||
|
"chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n",
|
||||||
|
"response = chain.invoke(text)\n",
|
||||||
|
"print(response.content)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Now, let's add **deanonymization step** to our sequence:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Dear Sir/Madam,\n",
|
||||||
|
"\n",
|
||||||
|
"We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n",
|
||||||
|
"\n",
|
||||||
|
"If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n",
|
||||||
|
"\n",
|
||||||
|
"Your prompt assistance in this matter would be greatly appreciated.\n",
|
||||||
|
"\n",
|
||||||
|
"Yours faithfully,\n",
|
||||||
|
"\n",
|
||||||
|
"[Your Name]\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n",
|
||||||
|
"response = chain.invoke(text)\n",
|
||||||
|
"print(response)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
    "Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the fake value was replaced with the real one."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Extra knowledge"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'PERSON': {'Maria Lynch': 'Slim Shady'},\n",
|
||||||
|
" 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
|
||||||
|
" 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
|
||||||
|
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer = PresidioReversibleAnonymizer(\n",
|
||||||
|
" analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
|
||||||
|
" # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
|
||||||
|
" # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
|
||||||
|
" faker_seed=42,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.anonymize(\n",
|
||||||
|
" \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
|
||||||
|
" \"By the way, my card number is: 4916 0387 9536 0861\"\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.deanonymizer_mapping"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Anonymizing more texts will result in new mapping entries:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n",
|
||||||
|
" 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
|
||||||
|
" 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
|
||||||
|
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n",
|
||||||
|
" '3537672423884966': '4001 9192 5753 7193'}}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"print(\n",
|
||||||
|
" anonymizer.anonymize(\n",
|
||||||
|
" \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. I'm John Doe by the way.\"\n",
|
||||||
|
" )\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.deanonymizer_mapping"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can save the mapping itself to a file for future use: "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# We can save the deanonymizer mapping as a JSON or YAML file\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n",
|
||||||
|
"# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"And then, load it in another `PresidioReversibleAnonymizer` instance:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"anonymizer = PresidioReversibleAnonymizer()\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.deanonymizer_mapping"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n",
|
||||||
|
" 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
|
||||||
|
" 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
|
||||||
|
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n",
|
||||||
|
" '3537672423884966': '4001 9192 5753 7193'}}"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n",
|
||||||
|
"\n",
|
||||||
|
"anonymizer.deanonymizer_mapping"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Future works\n",
|
||||||
|
"\n",
|
||||||
|
"- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n",
|
||||||
|
"- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs."
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Konko\n",
|
||||||
|
"\n",
|
||||||
|
">[Konko](https://www.konko.ai/) API is a fully managed Web API designed to help application developers:\n",
|
||||||
|
"\n",
|
||||||
|
"Konko API is a fully managed API designed to help application developers:\n",
|
||||||
|
"\n",
|
||||||
|
"1. Select the right LLM(s) for their application\n",
|
||||||
|
"2. Prototype with various open-source and proprietary LLMs\n",
|
||||||
|
"3. Move to production in-line with their security, privacy, throughput, latency SLAs without infrastructure set-up or administration using Konko AI's SOC 2 compliant infrastructure\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"This example goes over how to use LangChain to interact with `Konko` [models](https://docs.konko.ai/docs/overview)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
    "To run this notebook, you'll need a Konko API key. You can request it by messaging support@konko.ai."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatKonko\n",
|
||||||
|
"from langchain.prompts.chat import (\n",
|
||||||
|
" ChatPromptTemplate,\n",
|
||||||
|
" SystemMessagePromptTemplate,\n",
|
||||||
|
" AIMessagePromptTemplate,\n",
|
||||||
|
" HumanMessagePromptTemplate,\n",
|
||||||
|
")\n",
|
||||||
|
"from langchain.schema import AIMessage, HumanMessage, SystemMessage"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## 2. Set API Keys\n",
|
||||||
|
"\n",
|
||||||
|
"<br />\n",
|
||||||
|
"\n",
|
||||||
|
"### Option 1: Set Environment Variables\n",
|
||||||
|
"\n",
|
||||||
|
"1. You can set environment variables for \n",
|
||||||
|
" 1. KONKO_API_KEY (Required)\n",
|
||||||
|
" 2. OPENAI_API_KEY (Optional)\n",
|
||||||
|
"2. In your current shell session, use the export command:\n",
|
||||||
|
"\n",
|
||||||
|
"```shell\n",
|
||||||
|
"export KONKO_API_KEY={your_KONKO_API_KEY_here}\n",
|
||||||
|
"export OPENAI_API_KEY={your_OPENAI_API_KEY_here} #Optional\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"Alternatively, you can add the above lines directly to your shell startup script (such as .bashrc or .bash_profile for Bash shell and .zshrc for Zsh shell) to have them set automatically every time a new shell session starts.\n",
|
||||||
|
"\n",
|
||||||
|
"### Option 2: Set API Keys Programmatically\n",
|
||||||
|
"\n",
|
||||||
|
"If you prefer to set your API keys directly within your Python script or Jupyter notebook, you can use the following commands:\n",
|
||||||
|
"\n",
|
||||||
|
"```python\n",
|
||||||
|
"konko.set_api_key('your_KONKO_API_KEY_here') \n",
|
||||||
|
"konko.set_openai_api_key('your_OPENAI_API_KEY_here') # Optional\n",
|
||||||
|
"```\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Calling a model\n",
|
||||||
|
"\n",
|
||||||
|
"Find a model on the [Konko overview page](https://docs.konko.ai/docs/overview)\n",
|
||||||
|
"\n",
|
||||||
|
"For example, for this [LLama 2 model](https://docs.konko.ai/docs/meta-llama-2-13b-chat). The model id would be: `\"meta-llama/Llama-2-13b-chat-hf\"`\n",
|
||||||
|
"\n",
|
||||||
|
"Another way to find the list of models running on the Konko instance is through this [endpoint](https://docs.konko.ai/reference/listmodels).\n",
|
||||||
|
"\n",
|
||||||
|
"From here, we can initialize our model:\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"chat = ChatKonko(max_tokens=400, model = 'meta-llama/Llama-2-13b-chat-hf')"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"AIMessage(content=\" Sure, I'd be happy to explain the Big Bang Theory briefly!\\n\\nThe Big Bang Theory is the leading explanation for the origin and evolution of the universe, based on a vast amount of observational evidence from many fields of science. In essence, the theory posits that the universe began as an infinitely hot and dense point, known as a singularity, around 13.8 billion years ago. This singularity expanded rapidly, and as it did, it cooled and formed subatomic particles, which eventually coalesced into the first atoms, and later into the stars and galaxies we see today.\\n\\nThe theory gets its name from the idea that the universe began in a state of incredibly high energy and temperature, and has been expanding and cooling ever since. This expansion is thought to have been driven by a mysterious force known as dark energy, which is thought to be responsible for the accelerating expansion of the universe.\\n\\nOne of the key predictions of the Big Bang Theory is that the universe should be homogeneous and isotropic on large scales, meaning that it should look the same in all directions and have the same properties everywhere. This prediction has been confirmed by a wealth of observational evidence, including the cosmic microwave background radiation, which is thought to be a remnant of the early universe.\\n\\nOverall, the Big Bang Theory is a well-established and widely accepted explanation for the origins of the universe, and it has been supported by a vast amount of observational evidence from many fields of science.\", additional_kwargs={}, example=False)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"messages = [\n",
|
||||||
|
" SystemMessage(\n",
|
||||||
|
" content=\"You are a helpful assistant.\"\n",
|
||||||
|
" ),\n",
|
||||||
|
" HumanMessage(\n",
|
||||||
|
" content=\"Explain Big Bang Theory briefly\"\n",
|
||||||
|
" ),\n",
|
||||||
|
"]\n",
|
||||||
|
"chat(messages)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.3"
|
||||||
|
},
|
||||||
|
"vscode": {
|
||||||
|
"interpreter": {
|
||||||
|
"hash": "a0a0263b650d907a3bfe41c0f8d6a63a071b884df3cfdc1579f00cdc1aed6b03"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -1,79 +1,72 @@
|
|||||||
# Banana
|
# Banana
|
||||||
|
|
||||||
This page covers how to use the Banana ecosystem within LangChain.
|
Banana provided serverless GPU inference for AI models, including a CI/CD build pipeline and a simple Python framework (Potassium) to server your models.
|
||||||
It is broken into two parts: installation and setup, and then references to specific Banana wrappers.
|
|
||||||
|
This page covers how to use the [Banana](https://www.banana.dev) ecosystem within LangChain.
|
||||||
|
|
||||||
|
It is broken into two parts:
|
||||||
|
* installation and setup,
|
||||||
|
* and then references to specific Banana wrappers.
|
||||||
|
|
||||||
## Installation and Setup
|
## Installation and Setup
|
||||||
|
|
||||||
- Install with `pip install banana-dev`
|
- Install with `pip install banana-dev`
|
||||||
- Get an Banana api key and set it as an environment variable (`BANANA_API_KEY`)
|
- Get an Banana api key from the [Banana.dev dashboard](https://app.banana.dev) and set it as an environment variable (`BANANA_API_KEY`)
|
||||||
|
- Get your model's key and url slug from the model's details page
|
||||||
|
|
||||||
## Define your Banana Template
|
## Define your Banana Template
|
||||||
|
|
||||||
If you want to use an available language model template you can find one [here](https://app.banana.dev/templates/conceptofmind/serverless-template-palmyra-base).
|
You'll need to set up a Github repo for your Banana app. You can get started in 5 minutes using [this guide](https://docs.banana.dev/banana-docs/).
|
||||||
This template uses the Palmyra-Base model by [Writer](https://writer.com/product/api/).
|
|
||||||
You can check out an example Banana repository [here](https://github.com/conceptofmind/serverless-template-palmyra-base).
|
Alternatively, for a ready-to-go LLM example, you can check out Banana's [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq) GitHub repository. Just fork it and deploy it within Banana.
|
||||||
|
|
||||||
|
Other starter repos are available [here](https://github.com/orgs/bananaml/repositories?q=demo-&type=all&language=&sort=).
|
||||||
|
|
||||||
## Build the Banana app
|
## Build the Banana app
|
||||||
|
|
||||||
Banana Apps must include the "output" key in the return json.
|
To use Banana apps within Langchain, they must include the `outputs` key
|
||||||
There is a rigid response structure.
|
in the returned json, and the value must be a string.
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Return the results as a dictionary
|
# Return the results as a dictionary
|
||||||
result = {'output': result}
|
result = {'outputs': result}
|
||||||
```
|
```
|
||||||
|
|
||||||
An example inference function would be:
|
An example inference function would be:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
def inference(model_inputs:dict) -> dict:
|
@app.handler("/")
|
||||||
global model
|
def handler(context: dict, request: Request) -> Response:
|
||||||
global tokenizer
|
"""Handle a request to generate code from a prompt."""
|
||||||
|
model = context.get("model")
|
||||||
# Parse out your arguments
|
tokenizer = context.get("tokenizer")
|
||||||
prompt = model_inputs.get('prompt', None)
|
max_new_tokens = request.json.get("max_new_tokens", 512)
|
||||||
if prompt == None:
|
temperature = request.json.get("temperature", 0.7)
|
||||||
return {'message': "No prompt provided"}
|
prompt = request.json.get("prompt")
|
||||||
|
prompt_template=f'''[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
|
||||||
# Run the model
|
{prompt}
|
||||||
input_ids = tokenizer.encode(prompt, return_tensors='pt').cuda()
|
[/INST]
|
||||||
output = model.generate(
|
'''
|
||||||
input_ids,
|
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
|
||||||
max_length=100,
|
output = model.generate(inputs=input_ids, temperature=temperature, max_new_tokens=max_new_tokens)
|
||||||
do_sample=True,
|
result = tokenizer.decode(output[0])
|
||||||
top_k=50,
|
return Response(json={"outputs": result}, status=200)
|
||||||
top_p=0.95,
|
|
||||||
num_return_sequences=1,
|
|
||||||
temperature=0.9,
|
|
||||||
early_stopping=True,
|
|
||||||
no_repeat_ngram_size=3,
|
|
||||||
num_beams=5,
|
|
||||||
length_penalty=1.5,
|
|
||||||
repetition_penalty=1.5,
|
|
||||||
bad_words_ids=[[tokenizer.encode(' ', add_prefix_space=True)[0]]]
|
|
||||||
)
|
|
||||||
|
|
||||||
result = tokenizer.decode(output[0], skip_special_tokens=True)
|
|
||||||
# Return the results as a dictionary
|
|
||||||
result = {'output': result}
|
|
||||||
return result
|
|
||||||
```
|
```
|
||||||
|
|
||||||
You can find a full example of a Banana app [here](https://github.com/conceptofmind/serverless-template-palmyra-base/blob/main/app.py).
|
This example is from the `app.py` file in [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq).
|
||||||
|
|
||||||
## Wrappers
|
## Wrappers
|
||||||
|
|
||||||
### LLM
|
### LLM
|
||||||
|
|
||||||
There exists an Banana LLM wrapper, which you can access with
|
Within Langchain, there exists a Banana LLM wrapper, which you can access with
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from langchain.llms import Banana
|
from langchain.llms import Banana
|
||||||
```
|
```
|
||||||
|
|
||||||
You need to provide a model key located in the dashboard:
|
You need to provide a model key and model url slug, which you can get from the model's details page in the [Banana.dev dashboard](https://app.banana.dev).
|
||||||
|
|
||||||
```python
|
```python
|
||||||
llm = Banana(model_key="YOUR_MODEL_KEY")
|
llm = Banana(model_key="YOUR_MODEL_KEY", model_url_slug="YOUR_MODEL_URL_SLUG")
|
||||||
```
|
```
|
||||||
|
@ -0,0 +1,22 @@
|
|||||||
|
# Confident AI
|
||||||
|
|
||||||
|
![Confident - Unit Testing for LLMs](https://github.com/confident-ai/deepeval)
|
||||||
|
|
||||||
|
>[DeepEval](https://confident-ai.com) package for unit testing LLMs.
|
||||||
|
> Using Confident, everyone can build robust language models through faster iterations
|
||||||
|
> using both unit testing and integration testing. We provide support for each step in the iteration
|
||||||
|
> from synthetic data creation to testing.
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
|
||||||
|
First, you'll need to install the `DeepEval` Python package as follows:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install deepeval
|
||||||
|
```
|
||||||
|
|
||||||
|
Afterwards, you can get started in as little as a few lines of code.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.callbacks import DeepEvalCallback
|
||||||
|
```
|
@ -0,0 +1,80 @@
|
|||||||
|
# Konko
|
||||||
|
This page covers how to run models on Konko within LangChain.
|
||||||
|
|
||||||
|
Konko API is a fully managed API designed to help application developers:
|
||||||
|
|
||||||
|
Select the right LLM(s) for their application
|
||||||
|
Prototype with various open-source and proprietary LLMs
|
||||||
|
Move to production in-line with their security, privacy, throughput, latency SLAs without infrastructure set-up or administration using Konko AI's SOC 2 compliant infrastructure
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
|
||||||
|
### First you'll need an API key
|
||||||
|
You can request it by messaging [support@konko.ai](mailto:support@konko.ai)
|
||||||
|
|
||||||
|
### Install Konko AI's Python SDK
|
||||||
|
|
||||||
|
#### 1. Enable a Python3.8+ environment
|
||||||
|
|
||||||
|
#### 2. Set API Keys
|
||||||
|
|
||||||
|
##### Option 1: Set Environment Variables
|
||||||
|
|
||||||
|
1. You can set environment variables for
|
||||||
|
1. KONKO_API_KEY (Required)
|
||||||
|
2. OPENAI_API_KEY (Optional)
|
||||||
|
|
||||||
|
2. In your current shell session, use the export command:
|
||||||
|
|
||||||
|
```shell
|
||||||
|
export KONKO_API_KEY={your_KONKO_API_KEY_here}
|
||||||
|
export OPENAI_API_KEY={your_OPENAI_API_KEY_here} #Optional
|
||||||
|
```
|
||||||
|
|
||||||
|
Alternatively, you can add the above lines directly to your shell startup script (such as .bashrc or .bash_profile for Bash shell and .zshrc for Zsh shell) to have them set automatically every time a new shell session starts.
|
||||||
|
|
||||||
|
##### Option 2: Set API Keys Programmatically
|
||||||
|
|
||||||
|
If you prefer to set your API keys directly within your Python script or Jupyter notebook, you can use the following commands:
|
||||||
|
|
||||||
|
```python
|
||||||
|
konko.set_api_key('your_KONKO_API_KEY_here')
|
||||||
|
konko.set_openai_api_key('your_OPENAI_API_KEY_here') # Optional
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 3. Install the SDK
|
||||||
|
|
||||||
|
|
||||||
|
```shell
|
||||||
|
pip install konko
|
||||||
|
```
|
||||||
|
|
||||||
|
#### 4. Verify Installation & Authentication
|
||||||
|
|
||||||
|
```python
|
||||||
|
#Confirm konko has installed successfully
|
||||||
|
import konko
|
||||||
|
#Confirm API keys from Konko and OpenAI are set properly
|
||||||
|
konko.Model.list()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Calling a model
|
||||||
|
|
||||||
|
Find a model on the [Konko Introduction page](https://docs.konko.ai/docs#available-models)
|
||||||
|
|
||||||
|
For example, for this [LLama 2 model](https://docs.konko.ai/docs/meta-llama-2-13b-chat). The model id would be: `"meta-llama/Llama-2-13b-chat-hf"`
|
||||||
|
|
||||||
|
Another way to find the list of models running on the Konko instance is through this [endpoint](https://docs.konko.ai/reference/listmodels).
|
||||||
|
|
||||||
|
From here, we can initialize our model:
|
||||||
|
|
||||||
|
```python
|
||||||
|
chat_instance = ChatKonko(max_tokens=10, model = 'meta-llama/Llama-2-13b-chat-hf')
|
||||||
|
```
|
||||||
|
|
||||||
|
And run it:
|
||||||
|
|
||||||
|
```python
|
||||||
|
msg = HumanMessage(content="Hi")
|
||||||
|
chat_response = chat_instance([msg])
|
||||||
|
```
|
@ -1,20 +1,24 @@
|
|||||||
# ModelScope
|
# ModelScope
|
||||||
|
|
||||||
|
>[ModelScope](https://www.modelscope.cn/home) is a big repository of the models and datasets.
|
||||||
|
|
||||||
This page covers how to use the modelscope ecosystem within LangChain.
|
This page covers how to use the modelscope ecosystem within LangChain.
|
||||||
It is broken into two parts: installation and setup, and then references to specific modelscope wrappers.
|
It is broken into two parts: installation and setup, and then references to specific modelscope wrappers.
|
||||||
|
|
||||||
## Installation and Setup
|
## Installation and Setup
|
||||||
|
|
||||||
* Install the Python SDK with `pip install modelscope`
|
Install the `modelscope` package.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install modelscope
|
||||||
|
```
|
||||||
|
|
||||||
## Wrappers
|
|
||||||
|
|
||||||
### Embeddings
|
## Text Embedding Models
|
||||||
|
|
||||||
There exists a modelscope Embeddings wrapper, which you can access with
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from langchain.embeddings import ModelScopeEmbeddings
|
from langchain.embeddings import ModelScopeEmbeddings
|
||||||
```
|
```
|
||||||
|
|
||||||
For a more detailed walkthrough of this, see [this notebook](/docs/integrations/text_embedding/modelscope_hub.html)
|
For a more detailed walkthrough of this, see [this notebook](/docs/integrations/text_embedding/modelscope_hub)
|
||||||
|
@ -1,17 +1,31 @@
|
|||||||
# NLPCloud
|
# NLPCloud
|
||||||
|
|
||||||
This page covers how to use the NLPCloud ecosystem within LangChain.
|
>[NLP Cloud](https://docs.nlpcloud.com/#introduction) is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data.
|
||||||
It is broken into two parts: installation and setup, and then references to specific NLPCloud wrappers.
|
|
||||||
|
|
||||||
## Installation and Setup
|
## Installation and Setup
|
||||||
- Install the Python SDK with `pip install nlpcloud`
|
|
||||||
|
- Install the `nlpcloud` package.
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install nlpcloud
|
||||||
|
```
|
||||||
|
|
||||||
- Get an NLPCloud api key and set it as an environment variable (`NLPCLOUD_API_KEY`)
|
- Get an NLPCloud api key and set it as an environment variable (`NLPCLOUD_API_KEY`)
|
||||||
|
|
||||||
## Wrappers
|
|
||||||
|
|
||||||
### LLM
|
## LLM
|
||||||
|
|
||||||
|
See a [usage example](/docs/integrations/llms/nlpcloud).
|
||||||
|
|
||||||
There exists an NLPCloud LLM wrapper, which you can access with
|
|
||||||
```python
|
```python
|
||||||
from langchain.llms import NLPCloud
|
from langchain.llms import NLPCloud
|
||||||
```
|
```
|
||||||
|
|
||||||
|
## Text Embedding Models
|
||||||
|
|
||||||
|
See a [usage example](/docs/integrations/text_embedding/nlp_cloud)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.embeddings import NLPCloudEmbeddings
|
||||||
|
```
|
||||||
|
@ -0,0 +1,126 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# NucliaDB\n",
|
||||||
|
"\n",
|
||||||
|
"You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n",
|
||||||
|
"\n",
|
||||||
|
"When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [create a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"#!pip install langchain nuclia"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Usage with nuclia.cloud"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.vectorstores.nucliadb import NucliaDB\n",
|
||||||
|
"API_KEY = \"YOUR_API_KEY\"\n",
|
||||||
|
"\n",
|
||||||
|
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=False, api_key=API_KEY)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Usage with a local instance\n",
|
||||||
|
"\n",
|
||||||
|
"Note: By default `backend` is set to `http://localhost:8080`."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.vectorstores.nucliadb import NucliaDB\n",
|
||||||
|
"\n",
|
||||||
|
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=True, backend=\"http://my-local-server\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Add and delete texts to your Knowledge Box"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"ids = ndb.add_texts([\"This is a new test\", \"This is a second test\"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"ndb.delete(ids=ids)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Search in your Knowledge Box"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"results = ndb.similarity_search(\"Who was inspired by Ada Lovelace?\")\n",
|
||||||
|
"print(res.page_content)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,472 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "13afcae7",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Redis self-querying \n",
|
||||||
|
"\n",
|
||||||
|
">[Redis](https://redis.com) is an open-source key-value store that can be used as a cache, message broker, database, vector database and more.\n",
|
||||||
|
"\n",
|
||||||
|
"In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Redis vector store. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "68e75fb9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating a Redis vector store\n",
|
||||||
|
"First we'll want to create a Redis vector store and seed it with some data. We've created a small demo set of documents that contain summaries of movies.\n",
|
||||||
|
"\n",
|
||||||
|
"**Note:** The self-query retriever requires you to have `lark` installed (`pip install lark`) along with integration-specific requirements."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "63a8af5b",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# !pip install redis redisvl openai tiktoken lark"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "83811610-7df3-4ede-b268-68a6a83ba9e2",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "dd01b61b-7d32-4a55-85d6-b2d2d4f18840",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import getpass\n",
|
||||||
|
"\n",
|
||||||
|
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "cb4a5787",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.schema import Document\n",
|
||||||
|
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||||
|
"from langchain.vectorstores import Redis\n",
|
||||||
|
"\n",
|
||||||
|
"embeddings = OpenAIEmbeddings()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "bcbe04d9",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = [\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n",
|
||||||
|
" metadata={\"year\": 1993, \"rating\": 7.7, \"director\": \"Steven Spielberg\", \"genre\": \"science fiction\"},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n",
|
||||||
|
" metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"genre\": \"science fiction\", \"rating\": 8.2},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n",
|
||||||
|
" metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"genre\": \"science fiction\", \"rating\": 8.6},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n",
|
||||||
|
" metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"genre\": \"drama\", \"rating\": 8.3},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Toys come alive and have a blast doing so\",\n",
|
||||||
|
" metadata={\"year\": 1995, \"director\": \"John Lasseter\", \"genre\": \"animated\", \"rating\": 9.1,},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" \"year\": 1979,\n",
|
||||||
|
" \"rating\": 9.9,\n",
|
||||||
|
" \"director\": \"Andrei Tarkovsky\",\n",
|
||||||
|
" \"genre\": \"science fiction\",\n",
|
||||||
|
" },\n",
|
||||||
|
" ),\n",
|
||||||
|
"]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "393aff3b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"`index_schema` does not match generated metadata schema.\n",
|
||||||
|
"If you meant to manually override the schema, please ignore this message.\n",
|
||||||
|
"index_schema: {'tag': [{'name': 'genre'}], 'text': [{'name': 'director'}], 'numeric': [{'name': 'year'}, {'name': 'rating'}]}\n",
|
||||||
|
"generated_schema: {'text': [{'name': 'director'}, {'name': 'genre'}], 'numeric': [{'name': 'year'}, {'name': 'rating'}], 'tag': []}\n",
|
||||||
|
"\n"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"index_schema = {\n",
|
||||||
|
" \"tag\": [{\"name\": \"genre\"}],\n",
|
||||||
|
" \"text\": [{\"name\": \"director\"}],\n",
|
||||||
|
" \"numeric\": [{\"name\": \"year\"}, {\"name\": \"rating\"}],\n",
|
||||||
|
"}\n",
|
||||||
|
"\n",
|
||||||
|
"vectorstore = Redis.from_documents(\n",
|
||||||
|
" docs, \n",
|
||||||
|
" embeddings, \n",
|
||||||
|
" redis_url=\"redis://localhost:6379\",\n",
|
||||||
|
" index_name=\"movie_reviews\",\n",
|
||||||
|
" index_schema=index_schema,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "5ecaab6d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating our self-querying retriever\n",
|
||||||
|
"Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "86e34dbf",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
|
||||||
|
"from langchain.chains.query_constructor.base import AttributeInfo\n",
|
||||||
|
"\n",
|
||||||
|
"metadata_field_info = [\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"genre\",\n",
|
||||||
|
" description=\"The genre of the movie\",\n",
|
||||||
|
" type=\"string or list[string]\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"year\",\n",
|
||||||
|
" description=\"The year the movie was released\",\n",
|
||||||
|
" type=\"integer\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"director\",\n",
|
||||||
|
" description=\"The name of the movie director\",\n",
|
||||||
|
" type=\"string\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n",
|
||||||
|
" ),\n",
|
||||||
|
"]\n",
|
||||||
|
"document_content_description = \"Brief summary of a movie\"\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "ea1126cb",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"llm = OpenAI(temperature=0)\n",
|
||||||
|
"retriever = SelfQueryRetriever.from_llm(\n",
|
||||||
|
" llm, \n",
|
||||||
|
" vectorstore, \n",
|
||||||
|
" document_content_description, \n",
|
||||||
|
" metadata_field_info, \n",
|
||||||
|
" verbose=True\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ea9df8d4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Testing it out\n",
|
||||||
|
"And now we can try actually using our retriever!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "38a126e9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/Users/bagatur/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
|
||||||
|
" warnings.warn(\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='dinosaur' filter=None limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'id': 'doc:movie_reviews:7b5481d753bc4135851b66fa61def7fb', 'director': 'Steven Spielberg', 'genre': 'science fiction', 'year': '1993', 'rating': '7.7'}),\n",
|
||||||
|
" Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'}),\n",
|
||||||
|
" Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'id': 'doc:movie_reviews:2cc66f38bfbd438eb3a045d90a1a4088', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'year': '1979', 'rating': '9.9'}),\n",
|
||||||
|
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'id': 'doc:movie_reviews:edf567b1d5334e02b2a4c692d853c80c', 'director': 'Satoshi Kon', 'genre': 'science fiction', 'year': '2006', 'rating': '8.6'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a relevant query\n",
|
||||||
|
"retriever.get_relevant_documents(\"What are some movies about dinosaurs\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "fc3f1e6e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.4) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'}),\n",
|
||||||
|
" Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'id': 'doc:movie_reviews:2cc66f38bfbd438eb3a045d90a1a4088', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'year': '1979', 'rating': '9.9'}),\n",
|
||||||
|
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'id': 'doc:movie_reviews:edf567b1d5334e02b2a4c692d853c80c', 'director': 'Satoshi Kon', 'genre': 'science fiction', 'year': '2006', 'rating': '8.6'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a filter\n",
|
||||||
|
"retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.4\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "b19d4da0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'id': 'doc:movie_reviews:bb899807b93c442083fd45e75a4779d5', 'director': 'Greta Gerwig', 'genre': 'drama', 'year': '2019', 'rating': '8.3'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a query and a filter\n",
|
||||||
|
"retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "f900e40e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='rating', value=8.5), Comparison(comparator=<Comparator.CONTAIN: 'contain'>, attribute='genre', value='science fiction')]) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'id': 'doc:movie_reviews:2cc66f38bfbd438eb3a045d90a1a4088', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'year': '1979', 'rating': '9.9'}),\n",
|
||||||
|
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'id': 'doc:movie_reviews:edf567b1d5334e02b2a4c692d853c80c', 'director': 'Satoshi Kon', 'genre': 'science fiction', 'year': '2006', 'rating': '8.6'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a composite filter\n",
|
||||||
|
"retriever.get_relevant_documents(\n",
|
||||||
|
" \"What's a highly rated (above 8.5) science fiction film?\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "12a51522",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='toys' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='year', value=1990), Comparison(comparator=<Comparator.LT: 'lt'>, attribute='year', value=2005), Comparison(comparator=<Comparator.CONTAIN: 'contain'>, attribute='genre', value='animated')]) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 12,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a query and composite filter\n",
|
||||||
|
"retriever.get_relevant_documents(\n",
|
||||||
|
" \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Filter k\n",
|
||||||
|
"\n",
|
||||||
|
"We can also use the self query retriever to specify `k`: the number of documents to fetch.\n",
|
||||||
|
"\n",
|
||||||
|
"We can do this by passing `enable_limit=True` to the constructor."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 13,
|
||||||
|
"id": "bff36b88-b506-4877-9c63-e5a1a8d78e64",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"retriever = SelfQueryRetriever.from_llm(\n",
|
||||||
|
" llm,\n",
|
||||||
|
" vectorstore,\n",
|
||||||
|
" document_content_description,\n",
|
||||||
|
" metadata_field_info,\n",
|
||||||
|
" enable_limit=True,\n",
|
||||||
|
" verbose=True,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 14,
|
||||||
|
"id": "2758d229-4f97-499c-819f-888acaf8ee10",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='dinosaur' filter=None limit=2\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'id': 'doc:movie_reviews:7b5481d753bc4135851b66fa61def7fb', 'director': 'Steven Spielberg', 'genre': 'science fiction', 'year': '1993', 'rating': '7.7'}),\n",
|
||||||
|
" Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 14,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a relevant query\n",
|
||||||
|
"retriever.get_relevant_documents(\"what are two movies about dinosaurs\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "poetry-venv",
|
||||||
|
"language": "python",
|
||||||
|
"name": "poetry-venv"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,587 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "13afcae7",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Supabase Vector self-querying \n",
|
||||||
|
"\n",
|
||||||
|
">[Supabase](https://supabase.com/docs) is an open source `Firebase` alternative. \n",
|
||||||
|
"> `Supabase` is built on top of `PostgreSQL`, which offers strong `SQL` \n",
|
||||||
|
"> querying capabilities and enables a simple interface with already-existing tools and frameworks.\n",
|
||||||
|
"\n",
|
||||||
|
">[PostgreSQL](https://en.wikipedia.org/wiki/PostgreSQL) also known as `Postgres`,\n",
|
||||||
|
"> is a free and open-source relational database management system (RDBMS) \n",
|
||||||
|
"> emphasizing extensibility and `SQL` compliance.\n",
|
||||||
|
"\n",
|
||||||
|
"In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Supabase vector store.\n",
|
||||||
|
"\n",
|
||||||
|
"Specifically we will:\n",
|
||||||
|
"1. Create a Supabase database\n",
|
||||||
|
"2. Enable the `pgvector` extension\n",
|
||||||
|
"3. Create a `documents` table and `match_documents` function that will be used by `SupabaseVectorStore`\n",
|
||||||
|
"4. Load sample documents into the vector store (database table)\n",
|
||||||
|
"5. Build and test a self-querying retriever"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "347935ad",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Setup Supabase Database\n",
|
||||||
|
"\n",
|
||||||
|
"1. Head over to https://database.new to provision your Supabase database.\n",
|
||||||
|
"2. In the studio, jump to the [SQL editor](https://supabase.com/dashboard/project/_/sql/new) and run the following script to enable `pgvector` and setup your database as a vector store:\n",
|
||||||
|
" ```sql\n",
|
||||||
|
" -- Enable the pgvector extension to work with embedding vectors\n",
|
||||||
|
" create extension if not exists vector;\n",
|
||||||
|
"\n",
|
||||||
|
" -- Create a table to store your documents\n",
|
||||||
|
" create table\n",
|
||||||
|
" documents (\n",
|
||||||
|
" id uuid primary key,\n",
|
||||||
|
" content text, -- corresponds to Document.pageContent\n",
|
||||||
|
" metadata jsonb, -- corresponds to Document.metadata\n",
|
||||||
|
" embedding vector (1536) -- 1536 works for OpenAI embeddings, change if needed\n",
|
||||||
|
" );\n",
|
||||||
|
"\n",
|
||||||
|
" -- Create a function to search for documents\n",
|
||||||
|
" create function match_documents (\n",
|
||||||
|
" query_embedding vector (1536),\n",
|
||||||
|
" filter jsonb default '{}'\n",
|
||||||
|
" ) returns table (\n",
|
||||||
|
" id uuid,\n",
|
||||||
|
" content text,\n",
|
||||||
|
" metadata jsonb,\n",
|
||||||
|
" similarity float\n",
|
||||||
|
" ) language plpgsql as $$\n",
|
||||||
|
" #variable_conflict use_column\n",
|
||||||
|
" begin\n",
|
||||||
|
" return query\n",
|
||||||
|
" select\n",
|
||||||
|
" id,\n",
|
||||||
|
" content,\n",
|
||||||
|
" metadata,\n",
|
||||||
|
" 1 - (documents.embedding <=> query_embedding) as similarity\n",
|
||||||
|
" from documents\n",
|
||||||
|
" where metadata @> filter\n",
|
||||||
|
" order by documents.embedding <=> query_embedding;\n",
|
||||||
|
" end;\n",
|
||||||
|
" $$;\n",
|
||||||
|
" ```"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "68e75fb9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating a Supabase vector store\n",
|
||||||
|
"Next we'll want to create a Supabase vector store and seed it with some data. We've created a small demo set of documents that contain summaries of movies.\n",
|
||||||
|
"\n",
|
||||||
|
"Be sure to install the latest version of `langchain`:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "78546fd7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install langchain"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "e06df198",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The self-query retriever requires you to have `lark` installed:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "63a8af5b",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install lark"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "114f768f",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We also need the `openai` and `supabase` packages:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "434ae558",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install openai"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "22431060-52c4-48a7-a97b-9f542b8b0928",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install supabase==1.0.0"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "83811610-7df3-4ede-b268-68a6a83ba9e2",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Since we are using `SupabaseVectorStore` and `OpenAIEmbeddings`, we have to load their API keys.\n",
|
||||||
|
"\n",
|
||||||
|
"- To find your `SUPABASE_URL` and `SUPABASE_SERVICE_KEY`, head to your Supabase project's [API settings](https://supabase.com/dashboard/project/_/settings/api).\n",
|
||||||
|
" - `SUPABASE_URL` corresponds to the Project URL\n",
|
||||||
|
" - `SUPABASE_SERVICE_KEY` corresponds to the `service_role` API key\n",
|
||||||
|
"\n",
|
||||||
|
"- To get your `OPENAI_API_KEY`, navigate to [API keys](https://platform.openai.com/account/api-keys) on your OpenAI account and create a new secret key."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "dd01b61b-7d32-4a55-85d6-b2d2d4f18840",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"import getpass\n",
|
||||||
|
"\n",
|
||||||
|
"os.environ[\"SUPABASE_URL\"] = getpass.getpass(\"Supabase URL:\")\n",
|
||||||
|
"os.environ[\"SUPABASE_SERVICE_KEY\"] = getpass.getpass(\"Supabase Service Key:\")\n",
|
||||||
|
"os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3aaf5075",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"_Optional:_ If you're storing your Supabase and OpenAI API keys in a `.env` file, you can load them with [`dotenv`](https://github.com/theskumar/python-dotenv)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e0089221",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"%pip install python-dotenv"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "3d56c5ef",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from dotenv import load_dotenv\n",
|
||||||
|
"\n",
|
||||||
|
"load_dotenv()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f6dd9aef",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"First we'll create a Supabase client and instantiate a OpenAI embeddings class."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "cb4a5787",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"import os\n",
|
||||||
|
"from supabase.client import Client, create_client\n",
|
||||||
|
"from langchain.schema import Document\n",
|
||||||
|
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
|
||||||
|
"from langchain.vectorstores import SupabaseVectorStore\n",
|
||||||
|
"\n",
|
||||||
|
"supabase_url = os.environ.get(\"SUPABASE_URL\")\n",
|
||||||
|
"supabase_key = os.environ.get(\"SUPABASE_SERVICE_KEY\")\n",
|
||||||
|
"supabase: Client = create_client(supabase_url, supabase_key)\n",
|
||||||
|
"\n",
|
||||||
|
"embeddings = OpenAIEmbeddings()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0fca9b0b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Next let's create our documents."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "bcbe04d9",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = [\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n",
|
||||||
|
" metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n",
|
||||||
|
" metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n",
|
||||||
|
" metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n",
|
||||||
|
" metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Toys come alive and have a blast doing so\",\n",
|
||||||
|
" metadata={\"year\": 1995, \"genre\": \"animated\"},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" \"year\": 1979,\n",
|
||||||
|
" \"rating\": 9.9,\n",
|
||||||
|
" \"director\": \"Andrei Tarkovsky\",\n",
|
||||||
|
" \"genre\": \"science fiction\",\n",
|
||||||
|
" \"rating\": 9.9,\n",
|
||||||
|
" },\n",
|
||||||
|
" ),\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"vectorstore = SupabaseVectorStore.from_documents(docs, embeddings, client=supabase, table_name=\"documents\", query_name=\"match_documents\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "5ecaab6d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating our self-querying retriever\n",
|
||||||
|
"Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "86e34dbf",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
|
||||||
|
"from langchain.chains.query_constructor.base import AttributeInfo\n",
|
||||||
|
"\n",
|
||||||
|
"metadata_field_info = [\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"genre\",\n",
|
||||||
|
" description=\"The genre of the movie\",\n",
|
||||||
|
" type=\"string or list[string]\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"year\",\n",
|
||||||
|
" description=\"The year the movie was released\",\n",
|
||||||
|
" type=\"integer\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"director\",\n",
|
||||||
|
" description=\"The name of the movie director\",\n",
|
||||||
|
" type=\"string\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n",
|
||||||
|
" ),\n",
|
||||||
|
"]\n",
|
||||||
|
"document_content_description = \"Brief summary of a movie\"\n",
|
||||||
|
"llm = OpenAI(temperature=0)\n",
|
||||||
|
"retriever = SelfQueryRetriever.from_llm(\n",
|
||||||
|
" llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ea9df8d4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Testing it out\n",
|
||||||
|
"And now we can try actually using our retriever!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "38a126e9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='dinosaur' filter=None limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'genre': 'science fiction', 'rating': 7.7}),\n",
|
||||||
|
" Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'}),\n",
|
||||||
|
" Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'}),\n",
|
||||||
|
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a relevant query\n",
|
||||||
|
"retriever.get_relevant_documents(\"What are some movies about dinosaurs\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "fc3f1e6e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'}),\n",
|
||||||
|
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a filter\n",
|
||||||
|
"retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "b19d4da0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'year': 2019, 'rating': 8.3, 'director': 'Greta Gerwig'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a query and a filter\n",
|
||||||
|
"retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women?\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "f900e40e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='rating', value=8.5), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='science fiction')]) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a composite filter\n",
|
||||||
|
"retriever.get_relevant_documents(\n",
|
||||||
|
" \"What's a highly rated (above 8.5) science fiction film?\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "12a51522",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='toys' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='year', value=1990), Comparison(comparator=<Comparator.LTE: 'lte'>, attribute='year', value=2005), Comparison(comparator=<Comparator.LIKE: 'like'>, attribute='genre', value='animated')]) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a query and composite filter\n",
|
||||||
|
"retriever.get_relevant_documents(\n",
|
||||||
|
" \"What's a movie after 1990 but before (or on) 2005 that's all about toys, and preferably is animated\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Filter k\n",
|
||||||
|
"\n",
|
||||||
|
"We can also use the self query retriever to specify `k`: the number of documents to fetch.\n",
|
||||||
|
"\n",
|
||||||
|
"We can do this by passing `enable_limit=True` to the constructor."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "bff36b88-b506-4877-9c63-e5a1a8d78e64",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"retriever = SelfQueryRetriever.from_llm(\n",
|
||||||
|
" llm,\n",
|
||||||
|
" vectorstore,\n",
|
||||||
|
" document_content_description,\n",
|
||||||
|
" metadata_field_info,\n",
|
||||||
|
" enable_limit=True,\n",
|
||||||
|
" verbose=True,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "2758d229-4f97-499c-819f-888acaf8ee10",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='dinosaur' filter=None limit=2\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'genre': 'science fiction', 'rating': 7.7}),\n",
|
||||||
|
" Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a relevant query\n",
|
||||||
|
"retriever.get_relevant_documents(\"what are two movies about dinosaurs\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.12"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,440 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "13afcae7",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Vectara self-querying \n",
|
||||||
|
"\n",
|
||||||
|
">[Vectara](https://docs.vectara.com/docs/) is a GenAI platform for developers. It provides a simple API to build Grounded Generation (aka Retrieval-augmented-generation) applications.\n",
|
||||||
|
"\n",
|
||||||
|
"In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Vectara vector store. "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "68e75fb9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Setup\n",
|
||||||
|
"\n",
|
||||||
|
"You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps (see our [quickstart](https://docs.vectara.com/docs/quickstart) guide):\n",
|
||||||
|
"1. [Sign up](https://console.vectara.com/signup) for a Vectara account if you don't already have one. Once you have completed your sign up you will have a Vectara customer ID. You can find your customer ID by clicking on your name, on the top-right of the Vectara console window.\n",
|
||||||
|
"2. Within your account you can create one or more corpora. Each corpus represents an area that stores text data upon ingest from input documents. To create a corpus, use the **\"Create Corpus\"** button. You then provide a name to your corpus as well as a description. Optionally you can define filtering attributes and apply some advanced options. If you click on your created corpus, you can see its name and corpus ID right on the top.\n",
|
||||||
|
"3. Next you'll need to create API keys to access the corpus. Click on the **\"Authorization\"** tab in the corpus view and then the **\"Create API Key\"** button. Give your key a name, and choose whether you want query only or query+index for your key. Click \"Create\" and you now have an active API key. Keep this key confidential. \n",
|
||||||
|
"\n",
|
||||||
|
"To use LangChain with Vectara, you'll need to have these three values: customer ID, corpus ID and api_key.\n",
|
||||||
|
"You can provide those to LangChain in two ways:\n",
|
||||||
|
"\n",
|
||||||
|
"1. Include in your environment these three variables: `VECTARA_CUSTOMER_ID`, `VECTARA_CORPUS_ID` and `VECTARA_API_KEY`.\n",
|
||||||
|
"\n",
|
||||||
|
"> For example, you can set these variables using os.environ and getpass as follows:\n",
|
||||||
|
"\n",
|
||||||
|
"```python\n",
|
||||||
|
"import os\n",
|
||||||
|
"import getpass\n",
|
||||||
|
"\n",
|
||||||
|
"os.environ[\"VECTARA_CUSTOMER_ID\"] = getpass.getpass(\"Vectara Customer ID:\")\n",
|
||||||
|
"os.environ[\"VECTARA_CORPUS_ID\"] = getpass.getpass(\"Vectara Corpus ID:\")\n",
|
||||||
|
"os.environ[\"VECTARA_API_KEY\"] = getpass.getpass(\"Vectara API Key:\")\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"1. Provide them as arguments when creating the Vectara vectorstore object:\n",
|
||||||
|
"\n",
|
||||||
|
"```python\n",
|
||||||
|
"vectorstore = Vectara(\n",
|
||||||
|
" vectara_customer_id=vectara_customer_id,\n",
|
||||||
|
" vectara_corpus_id=vectara_corpus_id,\n",
|
||||||
|
" vectara_api_key=vectara_api_key\n",
|
||||||
|
" )\n",
|
||||||
|
"```\n",
|
||||||
|
"\n",
|
||||||
|
"**Note:** The self-query retriever requires you to have `lark` installed (`pip install lark`). "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "742ac16d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Connecting to Vectara from LangChain\n",
|
||||||
|
"\n",
|
||||||
|
"In this example, we assume that you've created an account and a corpus, and added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY (created with permissions for both indexing and query) as environment variables.\n",
|
||||||
|
"\n",
|
||||||
|
"The corpus has 4 fields defined as metadata for filtering: year, director, rating, and genre\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "cb4a5787",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.embeddings import FakeEmbeddings\n",
|
||||||
|
"from langchain.schema import Document\n",
|
||||||
|
"from langchain.text_splitter import CharacterTextSplitter\n",
|
||||||
|
"from langchain.vectorstores import Vectara\n",
|
||||||
|
"from langchain.document_loaders import TextLoader\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.chains import ConversationalRetrievalChain\n",
|
||||||
|
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
|
||||||
|
"from langchain.chains.query_constructor.base import AttributeInfo\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "bcbe04d9",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"docs = [\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n",
|
||||||
|
" metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n",
|
||||||
|
" metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n",
|
||||||
|
" metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n",
|
||||||
|
" metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Toys come alive and have a blast doing so\",\n",
|
||||||
|
" metadata={\"year\": 1995, \"genre\": \"animated\"},\n",
|
||||||
|
" ),\n",
|
||||||
|
" Document(\n",
|
||||||
|
" page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n",
|
||||||
|
" metadata={\n",
|
||||||
|
" \"year\": 1979,\n",
|
||||||
|
" \"rating\": 9.9,\n",
|
||||||
|
" \"director\": \"Andrei Tarkovsky\",\n",
|
||||||
|
" \"genre\": \"science fiction\",\n",
|
||||||
|
" },\n",
|
||||||
|
" ),\n",
|
||||||
|
"]\n",
|
||||||
|
"\n",
|
||||||
|
"vectara = Vectara()\n",
|
||||||
|
"for doc in docs:\n",
|
||||||
|
" vectara.add_texts([doc.page_content], embedding=FakeEmbeddings(size=768), doc_metadata=doc.metadata)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "5ecaab6d",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Creating our self-querying retriever\n",
|
||||||
|
"Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "86e34dbf",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.retrievers.self_query.base import SelfQueryRetriever\n",
|
||||||
|
"from langchain.chains.query_constructor.base import AttributeInfo\n",
|
||||||
|
"\n",
|
||||||
|
"metadata_field_info = [\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"genre\",\n",
|
||||||
|
" description=\"The genre of the movie\",\n",
|
||||||
|
" type=\"string or list[string]\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"year\",\n",
|
||||||
|
" description=\"The year the movie was released\",\n",
|
||||||
|
" type=\"integer\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"director\",\n",
|
||||||
|
" description=\"The name of the movie director\",\n",
|
||||||
|
" type=\"string\",\n",
|
||||||
|
" ),\n",
|
||||||
|
" AttributeInfo(\n",
|
||||||
|
" name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n",
|
||||||
|
" ),\n",
|
||||||
|
"]\n",
|
||||||
|
"document_content_description = \"Brief summary of a movie\"\n",
|
||||||
|
"llm = OpenAI(temperature=0)\n",
|
||||||
|
"retriever = SelfQueryRetriever.from_llm(\n",
|
||||||
|
" llm, vectara, document_content_description, metadata_field_info, verbose=True\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ea9df8d4",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Testing it out\n",
|
||||||
|
"And now we can try actually using our retriever!"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "38a126e9",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stderr",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"/Users/ofer/dev/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n",
|
||||||
|
" warnings.warn(\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='dinosaur' filter=None limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n",
|
||||||
|
" Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'}),\n",
|
||||||
|
" Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n",
|
||||||
|
" Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'lang': 'eng', 'offset': '0', 'len': '76', 'year': '2010', 'director': 'Christopher Nolan', 'rating': '8.2', 'source': 'langchain'}),\n",
|
||||||
|
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 5,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a relevant query\n",
|
||||||
|
"retriever.get_relevant_documents(\"What are some movies about dinosaurs\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "fc3f1e6e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n",
|
||||||
|
" Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a filter\n",
|
||||||
|
"retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "b19d4da0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'lang': 'eng', 'offset': '0', 'len': '82', 'year': '2019', 'director': 'Greta Gerwig', 'rating': '8.3', 'source': 'langchain'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 7,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a query and a filter\n",
|
||||||
|
"retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "f900e40e",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GTE: 'gte'>, attribute='rating', value=8.5), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='science fiction')]) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a composite filter\n",
|
||||||
|
"retriever.get_relevant_documents(\n",
|
||||||
|
" \"What's a highly rated (above 8.5) science fiction film?\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "12a51522",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='toys' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='year', value=1990), Comparison(comparator=<Comparator.LT: 'lt'>, attribute='year', value=2005), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='animated')]) limit=None\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example specifies a query and composite filter\n",
|
||||||
|
"retriever.get_relevant_documents(\n",
|
||||||
|
" \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated\"\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Filter k\n",
|
||||||
|
"\n",
|
||||||
|
"We can also use the self query retriever to specify `k`: the number of documents to fetch.\n",
|
||||||
|
"\n",
|
||||||
|
"We can do this by passing `enable_limit=True` to the constructor."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "bff36b88-b506-4877-9c63-e5a1a8d78e64",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"retriever = SelfQueryRetriever.from_llm(\n",
|
||||||
|
" llm,\n",
|
||||||
|
" vectara,\n",
|
||||||
|
" document_content_description,\n",
|
||||||
|
" metadata_field_info,\n",
|
||||||
|
" enable_limit=True,\n",
|
||||||
|
" verbose=True,\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "2758d229-4f97-499c-819f-888acaf8ee10",
|
||||||
|
"metadata": {
|
||||||
|
"tags": []
|
||||||
|
},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"query='dinosaur' filter=None limit=2\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n",
|
||||||
|
" Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"# This example only specifies a relevant query\n",
|
||||||
|
"retriever.get_relevant_documents(\"what are two movies about dinosaurs\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.9"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -1,2 +1,2 @@
|
|||||||
label: 'More'
|
label: 'More'
|
||||||
position: 1
|
position: 2
|
@ -0,0 +1,307 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7f0b0c06-ee70-468c-8bf5-b023f9e5e0a2",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Diffbot Graph Transformer\n",
|
||||||
|
"\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"## Use case\n",
|
||||||
|
"\n",
|
||||||
|
"Text data often contain rich relationships and insights that can be useful for various analytics, recommendation engines, or knowledge management applications.\n",
|
||||||
|
"\n",
|
||||||
|
"Diffbot's NLP API allows for the extraction of entities, relationships, and semantic meaning from unstructured text data.\n",
|
||||||
|
"\n",
|
||||||
|
"By coupling Diffbot's NLP API with Neo4j, a graph database, you can create powerful, dynamic graph structures based on the information extracted from text. These graph structures are fully queryable and can be integrated into various applications.\n",
|
||||||
|
"\n",
|
||||||
|
"This combination allows for use cases such as:\n",
|
||||||
|
"\n",
|
||||||
|
"* Building knowledge graphs from textual documents, websites, or social media feeds.\n",
|
||||||
|
"* Generating recommendations based on semantic relationships in the data.\n",
|
||||||
|
"* Creating advanced search features that understand the relationships between entities.\n",
|
||||||
|
"* Building analytics dashboards that allow users to explore the hidden relationships in data.\n",
|
||||||
|
"\n",
|
||||||
|
"## Overview\n",
|
||||||
|
"\n",
|
||||||
|
"LangChain provides tools to interact with Graph Databases:\n",
|
||||||
|
"\n",
|
||||||
|
"1. `Construct knowledge graphs from text` using graph transformer and store integrations \n",
|
||||||
|
"2. `Query a graph database` using chains for query creation and execution\n",
|
||||||
|
"3. `Interact with a graph database` using agents for robust and flexible querying \n",
|
||||||
|
"\n",
|
||||||
|
"## Quickstart\n",
|
||||||
|
"\n",
|
||||||
|
"First, get required packages and set environment variables:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "975648da-b24f-4164-a671-6772179e12df",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip install langchain langchain-experimental openai neo4j wikipedia"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "77718977-629e-46c2-b091-f9191b9ec569",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Diffbot NLP Service\n",
|
||||||
|
"\n",
|
||||||
|
"Diffbot's NLP service is a tool for extracting entities, relationships, and semantic context from unstructured text data.\n",
|
||||||
|
"This extracted information can be used to construct a knowledge graph.\n",
|
||||||
|
"To use their service, you'll need to obtain an API key from [Diffbot](https://www.diffbot.com/products/natural-language/)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "2cbf97d0-3682-439b-8750-b695ff726789",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer\n",
|
||||||
|
"\n",
|
||||||
|
"diffbot_api_key = \"DIFFBOT_API_KEY\"\n",
|
||||||
|
"diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "5e3b894a-e3ee-46c7-8116-f8377f8f0159",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This code fetches Wikipedia articles about \"Warren Buffett\" and then uses `DiffbotGraphTransformer` to extract entities and relationships.\n",
|
||||||
|
"The `DiffbotGraphTransformer` outputs a structured data `GraphDocument`, which can be used to populate a graph database.\n",
|
||||||
|
"Note that text chunking is avoided due to Diffbot's [character limit per API request](https://docs.diffbot.com/reference/introduction-to-natural-language-api)."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 3,
|
||||||
|
"id": "53f8df86-47a1-44a1-9a0f-6725b90703bc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import WikipediaLoader\n",
|
||||||
|
"\n",
|
||||||
|
"query = \"Warren Buffett\"\n",
|
||||||
|
"raw_documents = WikipediaLoader(query=query).load()\n",
|
||||||
|
"graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "31bb851a-aab4-4b97-a6b7-fce397d32b47",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Loading the data into a knowledge graph\n",
|
||||||
|
"\n",
|
||||||
|
"You will need to have a running Neo4j instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or running a docker container. You can run a local docker container by running the executing the following script:\n",
|
||||||
|
"```\n",
|
||||||
|
"docker run \\\n",
|
||||||
|
" --name neo4j \\\n",
|
||||||
|
" -p 7474:7474 -p 7687:7687 \\\n",
|
||||||
|
" -d \\\n",
|
||||||
|
" -e NEO4J_AUTH=neo4j/pleaseletmein \\\n",
|
||||||
|
" -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n",
|
||||||
|
" neo4j:latest\n",
|
||||||
|
"``` \n",
|
||||||
|
"If you are using the docker container, you need to wait a couple of second for the database to start."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 4,
|
||||||
|
"id": "0b2b6641-5a5d-467c-b148-e6aad5e4baa7",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.graphs import Neo4jGraph\n",
|
||||||
|
"\n",
|
||||||
|
"url=\"bolt://localhost:7687\"\n",
|
||||||
|
"username=\"neo4j\"\n",
|
||||||
|
"password=\"pleaseletmein\"\n",
|
||||||
|
"\n",
|
||||||
|
"graph = Neo4jGraph(\n",
|
||||||
|
" url=url,\n",
|
||||||
|
" username=username, \n",
|
||||||
|
" password=password\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "0b15e840-fe6f-45db-9193-1b4e2df5c12c",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"The `GraphDocuments` can be loaded into a knowledge graph using the `add_graph_documents` method."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 5,
|
||||||
|
"id": "1a67c4a8-955c-42a2-9c5d-de3ac0e640ec",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"graph.add_graph_documents(graph_documents)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ed411e05-2b03-460d-997e-938482774f40",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Refresh graph schema information\n",
|
||||||
|
"If the schema of database changes, you can refresh the schema information needed to generate Cypher statements"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "904c9ee3-787c-403f-857d-459ce5ad5a1b",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"graph.refresh_schema()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "f19d1387-5899-4258-8c94-8ef5fa7db464",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Querying the graph\n",
|
||||||
|
"We can now use the graph cypher QA chain to ask question of the graph. It is advisable to use **gpt-4** to construct Cypher queries to get the best experience."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 7,
|
||||||
|
"id": "9393b732-67c8-45c1-9ec2-089f49c62448",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chains import GraphCypherQAChain\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"\n",
|
||||||
|
"chain = GraphCypherQAChain.from_llm(\n",
|
||||||
|
" cypher_llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\"),\n",
|
||||||
|
" qa_llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"),\n",
|
||||||
|
" graph=graph, verbose=True,\n",
|
||||||
|
" \n",
|
||||||
|
")\n",
|
||||||
|
" "
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 8,
|
||||||
|
"id": "1a9b3652-b436-404d-aa25-5fb576f23dc0",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
|
||||||
|
"Generated Cypher:\n",
|
||||||
|
"\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: \"Warren Buffett\"})-[:EDUCATED_AT]->(o:Organization)\n",
|
||||||
|
"RETURN o.name\u001b[0m\n",
|
||||||
|
"Full Context:\n",
|
||||||
|
"\u001b[32;1m\u001b[1;3m[{'o.name': 'New York Institute of Finance'}, {'o.name': 'Alice Deal Junior High School'}, {'o.name': 'Woodrow Wilson High School'}, {'o.name': 'University of Nebraska'}]\u001b[0m\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Warren Buffett attended the University of Nebraska.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 8,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.run(\"Which university did Warren Buffett attend?\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 9,
|
||||||
|
"id": "adc0ba0f-a62c-4875-89ce-da717f3ab148",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"name": "stdout",
|
||||||
|
"output_type": "stream",
|
||||||
|
"text": [
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
|
||||||
|
"Generated Cypher:\n",
|
||||||
|
"\u001b[32;1m\u001b[1;3mMATCH (p:Person)-[r:EMPLOYEE_OR_MEMBER_OF]->(o:Organization) WHERE o.name = 'Berkshire Hathaway' RETURN p.name\u001b[0m\n",
|
||||||
|
"Full Context:\n",
|
||||||
|
"\u001b[32;1m\u001b[1;3m[{'p.name': 'Charlie Munger'}, {'p.name': 'Oliver Chace'}, {'p.name': 'Howard Buffett'}, {'p.name': 'Howard'}, {'p.name': 'Susan Buffett'}, {'p.name': 'Warren Buffett'}]\u001b[0m\n",
|
||||||
|
"\n",
|
||||||
|
"\u001b[1m> Finished chain.\u001b[0m\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"'Charlie Munger, Oliver Chace, Howard Buffett, Susan Buffett, and Warren Buffett are or were working at Berkshire Hathaway.'"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 9,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"chain.run(\"Who is or was working at Berkshire Hathaway?\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "d636954b-d967-4e96-9489-92e11c74af35",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.4"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -0,0 +1,3 @@
|
|||||||
|
label: 'QA over structured data'
|
||||||
|
collapsed: false
|
||||||
|
position: 0.5
|
@ -0,0 +1 @@
|
|||||||
|
label: 'Integration-specific'
|
@ -0,0 +1,158 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Elasticsearch\n",
|
||||||
|
"\n",
|
||||||
|
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/qa_structured/integrations/elasticsearch.ipynb)\n",
|
||||||
|
"\n",
|
||||||
|
"We can use LLMs to interact with Elasticsearch analytics databases in natural language.\n",
|
||||||
|
"\n",
|
||||||
|
"This chain builds search queries via the Elasticsearch DSL API (filters and aggregations).\n",
|
||||||
|
"\n",
|
||||||
|
"The Elasticsearch client must have permissions for index listing, mapping description and search queries.\n",
|
||||||
|
"\n",
|
||||||
|
"See [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html) for instructions on how to run Elasticsearch locally."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"! pip install langchain langchain-experimental openai elasticsearch\n",
|
||||||
|
"\n",
|
||||||
|
"# Set env var OPENAI_API_KEY or load from a .env file\n",
|
||||||
|
"# import dotenv\n",
|
||||||
|
"\n",
|
||||||
|
"# dotenv.load_dotenv()"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 15,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from elasticsearch import Elasticsearch\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.chains.elasticsearch_database import ElasticsearchDatabaseChain"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# Initialize Elasticsearch python client.\n",
|
||||||
|
"# See https://elasticsearch-py.readthedocs.io/en/v8.8.2/api.html#elasticsearch.Elasticsearch\n",
|
||||||
|
"ELASTIC_SEARCH_SERVER = \"https://elastic:pass@localhost:9200\"\n",
|
||||||
|
"db = Elasticsearch(ELASTIC_SEARCH_SERVER)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"Uncomment the next cell to initially populate your db."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"# customers = [\n",
|
||||||
|
"# {\"firstname\": \"Jennifer\", \"lastname\": \"Walters\"},\n",
|
||||||
|
"# {\"firstname\": \"Monica\",\"lastname\":\"Rambeau\"},\n",
|
||||||
|
"# {\"firstname\": \"Carol\",\"lastname\":\"Danvers\"},\n",
|
||||||
|
"# {\"firstname\": \"Wanda\",\"lastname\":\"Maximoff\"},\n",
|
||||||
|
"# {\"firstname\": \"Jennifer\",\"lastname\":\"Takeda\"},\n",
|
||||||
|
"# ]\n",
|
||||||
|
"# for i, customer in enumerate(customers):\n",
|
||||||
|
"# db.create(index=\"customers\", document=customer, id=i)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"llm = ChatOpenAI(model_name=\"gpt-4\", temperature=0)\n",
|
||||||
|
"chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, verbose=True)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"question = \"What are the first names of all the customers?\"\n",
|
||||||
|
"chain.run(question)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"We can customize the prompt."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chains.elasticsearch_database.prompts import DEFAULT_DSL_TEMPLATE\n",
|
||||||
|
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||||
|
"\n",
|
||||||
|
"PROMPT_TEMPLATE = \"\"\"Given an input question, create a syntactically correct Elasticsearch query to run. Unless the user specifies in their question a specific number of examples they wish to obtain, always limit your query to at most {top_k} results. You can order the results by a relevant column to return the most interesting examples in the database.\n",
|
||||||
|
"\n",
|
||||||
|
"Unless told to do not query for all the columns from a specific index, only ask for a the few relevant columns given the question.\n",
|
||||||
|
"\n",
|
||||||
|
"Pay attention to use only the column names that you can see in the mapping description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which index. Return the query as valid json.\n",
|
||||||
|
"\n",
|
||||||
|
"Use the following format:\n",
|
||||||
|
"\n",
|
||||||
|
"Question: Question here\n",
|
||||||
|
"ESQuery: Elasticsearch Query formatted as json\n",
|
||||||
|
"\"\"\"\n",
|
||||||
|
"\n",
|
||||||
|
"PROMPT = PromptTemplate.from_template(\n",
|
||||||
|
" PROMPT_TEMPLATE,\n",
|
||||||
|
")\n",
|
||||||
|
"chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, query_prompt=PROMPT)"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.1"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 4
|
||||||
|
}
|
@ -0,0 +1,200 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "245065c6",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Vector SQL Retriever with MyScale\n",
|
||||||
|
"\n",
|
||||||
|
">[MyScale](https://docs.myscale.com/en/) is an integrated vector database. You can access your database in SQL and also from here, LangChain. MyScale can make a use of [various data types and functions for filters](https://blog.myscale.com/2023/06/06/why-integrated-database-solution-can-boost-your-llm-apps/#filter-on-anything-without-constraints). It will boost up your LLM app no matter if you are scaling up your data or expand your system to broader application."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "0246c5bf",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"!pip3 install clickhouse-sqlalchemy InstructorEmbedding sentence_transformers openai langchain-experimental"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7585d2c3",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"from os import environ\n",
|
||||||
|
"import getpass\n",
|
||||||
|
"from typing import Dict, Any\n",
|
||||||
|
"from langchain import OpenAI, SQLDatabase, LLMChain\n",
|
||||||
|
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
|
||||||
|
"from sqlalchemy import create_engine, Column, MetaData\n",
|
||||||
|
"from langchain import PromptTemplate\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"from sqlalchemy import create_engine\n",
|
||||||
|
"\n",
|
||||||
|
"MYSCALE_HOST = \"msc-1decbcc9.us-east-1.aws.staging.myscale.cloud\"\n",
|
||||||
|
"MYSCALE_PORT = 443\n",
|
||||||
|
"MYSCALE_USER = \"chatdata\"\n",
|
||||||
|
"MYSCALE_PASSWORD = \"myscale_rocks\"\n",
|
||||||
|
"OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n",
|
||||||
|
"\n",
|
||||||
|
"engine = create_engine(\n",
|
||||||
|
" f\"clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https\"\n",
|
||||||
|
")\n",
|
||||||
|
"metadata = MetaData(bind=engine)\n",
|
||||||
|
"environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "e08d9ddc",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.embeddings import HuggingFaceInstructEmbeddings\n",
|
||||||
|
"from langchain_experimental.sql.vector_sql import VectorSQLOutputParser\n",
|
||||||
|
"\n",
|
||||||
|
"output_parser = VectorSQLOutputParser.from_embeddings(\n",
|
||||||
|
" model=HuggingFaceInstructEmbeddings(\n",
|
||||||
|
" model_name=\"hkunlp/instructor-xl\", model_kwargs={\"device\": \"cpu\"}\n",
|
||||||
|
" )\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "84b705b2",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"\n",
|
||||||
|
"from langchain.llms import OpenAI\n",
|
||||||
|
"from langchain.callbacks import StdOutCallbackHandler\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain.utilities.sql_database import SQLDatabase\n",
|
||||||
|
"from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
|
||||||
|
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
|
||||||
|
"\n",
|
||||||
|
"chain = VectorSQLDatabaseChain(\n",
|
||||||
|
" llm_chain=LLMChain(\n",
|
||||||
|
" llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
|
||||||
|
" prompt=MYSCALE_PROMPT,\n",
|
||||||
|
" ),\n",
|
||||||
|
" top_k=10,\n",
|
||||||
|
" return_direct=True,\n",
|
||||||
|
" sql_cmd_parser=output_parser,\n",
|
||||||
|
" database=SQLDatabase(engine, None, metadata),\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"import pandas as pd\n",
|
||||||
|
"\n",
|
||||||
|
"pd.DataFrame(\n",
|
||||||
|
" chain.run(\n",
|
||||||
|
" \"Please give me 10 papers to ask what is PageRank?\",\n",
|
||||||
|
" callbacks=[StdOutCallbackHandler()],\n",
|
||||||
|
" )\n",
|
||||||
|
")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "6c09cda0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## SQL Database as Retriever"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "734d7ff5",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.chat_models import ChatOpenAI\n",
|
||||||
|
"from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain\n",
|
||||||
|
"\n",
|
||||||
|
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
|
||||||
|
"from langchain_experimental.retrievers.vector_sql_database \\\n",
|
||||||
|
" import VectorSQLDatabaseChainRetriever\n",
|
||||||
|
"from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
|
||||||
|
"from langchain_experimental.sql.vector_sql import VectorSQLRetrieveAllOutputParser\n",
|
||||||
|
"\n",
|
||||||
|
"output_parser_retrieve_all = VectorSQLRetrieveAllOutputParser.from_embeddings(\n",
|
||||||
|
" output_parser.model\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"chain = VectorSQLDatabaseChain.from_llm(\n",
|
||||||
|
" llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
|
||||||
|
" prompt=MYSCALE_PROMPT,\n",
|
||||||
|
" top_k=10,\n",
|
||||||
|
" return_direct=True,\n",
|
||||||
|
" db=SQLDatabase(engine, None, metadata),\n",
|
||||||
|
" sql_cmd_parser=output_parser_retrieve_all,\n",
|
||||||
|
" native_format=True,\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"# You need all those keys to get docs\n",
|
||||||
|
"retriever = VectorSQLDatabaseChainRetriever(sql_db_chain=chain, page_content_key=\"abstract\")\n",
|
||||||
|
"\n",
|
||||||
|
"document_with_metadata_prompt = PromptTemplate(\n",
|
||||||
|
" input_variables=[\"page_content\", \"id\", \"title\", \"authors\", \"pubdate\", \"categories\"],\n",
|
||||||
|
" template=\"Content:\\n\\tTitle: {title}\\n\\tAbstract: {page_content}\\n\\tAuthors: {authors}\\n\\tDate of Publication: {pubdate}\\n\\tCategories: {categories}\\nSOURCE: {id}\",\n",
|
||||||
|
")\n",
|
||||||
|
"\n",
|
||||||
|
"chain = RetrievalQAWithSourcesChain.from_chain_type(\n",
|
||||||
|
" ChatOpenAI(\n",
|
||||||
|
" model_name=\"gpt-3.5-turbo-16k\", openai_api_key=OPENAI_API_KEY, temperature=0.6\n",
|
||||||
|
" ),\n",
|
||||||
|
" retriever=retriever,\n",
|
||||||
|
" chain_type=\"stuff\",\n",
|
||||||
|
" chain_type_kwargs={\n",
|
||||||
|
" \"document_prompt\": document_with_metadata_prompt,\n",
|
||||||
|
" },\n",
|
||||||
|
" return_source_documents=True,\n",
|
||||||
|
")\n",
|
||||||
|
"ans = chain(\"Please give me 10 papers to ask what is PageRank?\",\n",
|
||||||
|
" callbacks=[StdOutCallbackHandler()])\n",
|
||||||
|
"print(ans[\"answer\"])"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "4948ff25",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.11.3"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,15 @@
|
|||||||
|
# Vearch
|
||||||
|
|
||||||
|
Vearch is a scalable distributed system for efficient similarity search of deep learning vectors.
|
||||||
|
|
||||||
|
# Installation and Setup
|
||||||
|
|
||||||
|
Vearch Python SDK enables vearch to use locally. Vearch python sdk can be installed easily by pip install vearch.
|
||||||
|
|
||||||
|
# Vectorstore
|
||||||
|
|
||||||
|
Vearch also can used as vectorstore. Most detalis in [this notebook](docs/modules/indexes/vectorstores/examples/vearch.ipynb)
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.vectorstores import Vearch
|
||||||
|
```
|
@ -1,4 +1,7 @@
|
|||||||
"""Data anonymizer package"""
|
"""Data anonymizer package"""
|
||||||
from langchain_experimental.data_anonymizer.presidio import PresidioAnonymizer
|
from langchain_experimental.data_anonymizer.presidio import (
|
||||||
|
PresidioAnonymizer,
|
||||||
|
PresidioReversibleAnonymizer,
|
||||||
|
)
|
||||||
|
|
||||||
__all__ = ["PresidioAnonymizer"]
|
__all__ = ["PresidioAnonymizer", "PresidioReversibleAnonymizer"]
|
||||||
|
@ -0,0 +1,21 @@
|
|||||||
|
from collections import defaultdict
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
MappingDataType = Dict[str, Dict[str, str]]
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class DeanonymizerMapping:
|
||||||
|
mapping: MappingDataType = field(
|
||||||
|
default_factory=lambda: defaultdict(lambda: defaultdict(str))
|
||||||
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def data(self) -> MappingDataType:
|
||||||
|
"""Return the deanonymizer mapping"""
|
||||||
|
return {k: dict(v) for k, v in self.mapping.items()}
|
||||||
|
|
||||||
|
def update(self, new_mapping: MappingDataType) -> None:
|
||||||
|
for entity_type, values in new_mapping.items():
|
||||||
|
self.mapping[entity_type].update(values)
|
@ -0,0 +1,17 @@
|
|||||||
|
from langchain_experimental.data_anonymizer.presidio import MappingDataType
|
||||||
|
|
||||||
|
|
||||||
|
def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
|
||||||
|
"""
|
||||||
|
Default matching strategy for deanonymization.
|
||||||
|
It replaces all the anonymized entities with the original ones.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: text to deanonymize
|
||||||
|
deanonymizer_mapping: mapping between anonymized entities and original ones"""
|
||||||
|
|
||||||
|
# Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
|
||||||
|
for entity_type in deanonymizer_mapping:
|
||||||
|
for anonymized, original in deanonymizer_mapping[entity_type].items():
|
||||||
|
text = text.replace(anonymized, original)
|
||||||
|
return text
|
@ -0,0 +1,5 @@
|
|||||||
|
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"DiffbotGraphTransformer",
|
||||||
|
]
|
@ -0,0 +1,316 @@
|
|||||||
|
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from langchain.graphs.graph_document import GraphDocument, Node, Relationship
|
||||||
|
from langchain.schema import Document
|
||||||
|
from langchain.utils import get_from_env
|
||||||
|
|
||||||
|
|
||||||
|
def format_property_key(s: str) -> str:
|
||||||
|
words = s.split()
|
||||||
|
if not words:
|
||||||
|
return s
|
||||||
|
first_word = words[0].lower()
|
||||||
|
capitalized_words = [word.capitalize() for word in words[1:]]
|
||||||
|
return "".join([first_word] + capitalized_words)
|
||||||
|
|
||||||
|
|
||||||
|
class NodesList:
|
||||||
|
"""
|
||||||
|
Manages a list of nodes with associated properties.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
nodes (Dict[Tuple, Any]): Stores nodes as keys and their properties as values.
|
||||||
|
Each key is a tuple where the first element is the
|
||||||
|
node ID and the second is the node type.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.nodes: Dict[Tuple[Union[str, int], str], Any] = dict()
|
||||||
|
|
||||||
|
def add_node_property(
|
||||||
|
self, node: Tuple[Union[str, int], str], properties: Dict[str, Any]
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Adds or updates node properties.
|
||||||
|
|
||||||
|
If the node does not exist in the list, it's added along with its properties.
|
||||||
|
If the node already exists, its properties are updated with the new values.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
node (Tuple): A tuple containing the node ID and node type.
|
||||||
|
properties (Dict): A dictionary of properties to add or update for the node.
|
||||||
|
"""
|
||||||
|
if node not in self.nodes:
|
||||||
|
self.nodes[node] = properties
|
||||||
|
else:
|
||||||
|
self.nodes[node].update(properties)
|
||||||
|
|
||||||
|
def return_node_list(self) -> List[Node]:
|
||||||
|
"""
|
||||||
|
Returns the nodes as a list of Node objects.
|
||||||
|
|
||||||
|
Each Node object will have its ID, type, and properties populated.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List[Node]: A list of Node objects.
|
||||||
|
"""
|
||||||
|
nodes = [
|
||||||
|
Node(id=key[0], type=key[1], properties=self.nodes[key])
|
||||||
|
for key in self.nodes
|
||||||
|
]
|
||||||
|
return nodes
|
||||||
|
|
||||||
|
|
||||||
|
# Properties that should be treated as node properties instead of relationships
|
||||||
|
FACT_TO_PROPERTY_TYPE = [
|
||||||
|
"Date",
|
||||||
|
"Number",
|
||||||
|
"Job title",
|
||||||
|
"Cause of death",
|
||||||
|
"Organization type",
|
||||||
|
"Academic title",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
schema_mapping = [
|
||||||
|
("HEADQUARTERS", "ORGANIZATION_LOCATIONS"),
|
||||||
|
("RESIDENCE", "PERSON_LOCATION"),
|
||||||
|
("ALL_PERSON_LOCATIONS", "PERSON_LOCATION"),
|
||||||
|
("CHILD", "HAS_CHILD"),
|
||||||
|
("PARENT", "HAS_PARENT"),
|
||||||
|
("CUSTOMERS", "HAS_CUSTOMER"),
|
||||||
|
("SKILLED_AT", "INTERESTED_IN"),
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class SimplifiedSchema:
|
||||||
|
"""
|
||||||
|
Provides functionality for working with a simplified schema mapping.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
schema (Dict): A dictionary containing the mapping to simplified schema types.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
"""Initializes the schema dictionary based on the predefined list."""
|
||||||
|
self.schema = dict()
|
||||||
|
for row in schema_mapping:
|
||||||
|
self.schema[row[0]] = row[1]
|
||||||
|
|
||||||
|
def get_type(self, type: str) -> str:
|
||||||
|
"""
|
||||||
|
Retrieves the simplified schema type for a given original type.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
type (str): The original schema type to find the simplified type for.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
str: The simplified schema type if it exists;
|
||||||
|
otherwise, returns the original type.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
return self.schema[type]
|
||||||
|
except KeyError:
|
||||||
|
return type
|
||||||
|
|
||||||
|
|
||||||
|
class DiffbotGraphTransformer:
|
||||||
|
"""Transforms documents into graph documents using Diffbot's NLP API.
|
||||||
|
|
||||||
|
A graph document transformation system takes a sequence of Documents and returns a
|
||||||
|
sequence of Graph Documents.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
.. code-block:: python
|
||||||
|
|
||||||
|
class DiffbotGraphTransformer(BaseGraphDocumentTransformer):
|
||||||
|
|
||||||
|
def transform_documents(
|
||||||
|
self, documents: Sequence[Document], **kwargs: Any
|
||||||
|
) -> Sequence[GraphDocument]:
|
||||||
|
results = []
|
||||||
|
|
||||||
|
for document in documents:
|
||||||
|
raw_results = self.nlp_request(document.page_content)
|
||||||
|
graph_document = self.process_response(raw_results, document)
|
||||||
|
results.append(graph_document)
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def atransform_documents(
|
||||||
|
self, documents: Sequence[Document], **kwargs: Any
|
||||||
|
) -> Sequence[Document]:
|
||||||
|
raise NotImplementedError
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
diffbot_api_key: Optional[str] = None,
|
||||||
|
fact_confidence_threshold: float = 0.7,
|
||||||
|
include_qualifiers: bool = True,
|
||||||
|
include_evidence: bool = True,
|
||||||
|
simplified_schema: bool = True,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the graph transformer with various options.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
diffbot_api_key (str):
|
||||||
|
The API key for Diffbot's NLP services.
|
||||||
|
|
||||||
|
fact_confidence_threshold (float):
|
||||||
|
Minimum confidence level for facts to be included.
|
||||||
|
include_qualifiers (bool):
|
||||||
|
Whether to include qualifiers in the relationships.
|
||||||
|
include_evidence (bool):
|
||||||
|
Whether to include evidence for the relationships.
|
||||||
|
simplified_schema (bool):
|
||||||
|
Whether to use a simplified schema for relationships.
|
||||||
|
"""
|
||||||
|
self.diffbot_api_key = diffbot_api_key or get_from_env(
|
||||||
|
"diffbot_api_key", "DIFFBOT_API_KEY"
|
||||||
|
)
|
||||||
|
self.fact_threshold_confidence = fact_confidence_threshold
|
||||||
|
self.include_qualifiers = include_qualifiers
|
||||||
|
self.include_evidence = include_evidence
|
||||||
|
self.simplified_schema = None
|
||||||
|
if simplified_schema:
|
||||||
|
self.simplified_schema = SimplifiedSchema()
|
||||||
|
|
||||||
|
def nlp_request(self, text: str) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Make an API request to the Diffbot NLP endpoint.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text (str): The text to be processed.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, Any]: The JSON response from the API.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Relationship extraction only works for English
|
||||||
|
payload = {
|
||||||
|
"content": text,
|
||||||
|
"lang": "en",
|
||||||
|
}
|
||||||
|
|
||||||
|
FIELDS = "facts"
|
||||||
|
HOST = "nl.diffbot.com"
|
||||||
|
url = (
|
||||||
|
f"https://{HOST}/v1/?fields={FIELDS}&"
|
||||||
|
f"token={self.diffbot_api_key}&language=en"
|
||||||
|
)
|
||||||
|
result = requests.post(url, data=payload)
|
||||||
|
return result.json()
|
||||||
|
|
||||||
|
def process_response(
|
||||||
|
self, payload: Dict[str, Any], document: Document
|
||||||
|
) -> GraphDocument:
|
||||||
|
"""
|
||||||
|
Transform the Diffbot NLP response into a GraphDocument.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
payload (Dict[str, Any]): The JSON response from Diffbot's NLP API.
|
||||||
|
document (Document): The original document.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
GraphDocument: The transformed document as a graph.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Return empty result if there are no facts
|
||||||
|
if "facts" not in payload or not payload["facts"]:
|
||||||
|
return GraphDocument(nodes=[], relationships=[], source=document)
|
||||||
|
|
||||||
|
# Nodes are a custom class because we need to deduplicate
|
||||||
|
nodes_list = NodesList()
|
||||||
|
# Relationships are a list because we don't deduplicate nor anything else
|
||||||
|
relationships = list()
|
||||||
|
for record in payload["facts"]:
|
||||||
|
# Skip if the fact is below the threshold confidence
|
||||||
|
if record["confidence"] < self.fact_threshold_confidence:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# TODO: It should probably be treated as a node property
|
||||||
|
if not record["value"]["allTypes"]:
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Define source node
|
||||||
|
source_id = (
|
||||||
|
record["entity"]["allUris"][0]
|
||||||
|
if record["entity"]["allUris"]
|
||||||
|
else record["entity"]["name"]
|
||||||
|
)
|
||||||
|
source_label = record["entity"]["allTypes"][0]["name"].capitalize()
|
||||||
|
source_name = record["entity"]["name"]
|
||||||
|
source_node = Node(id=source_id, type=source_label)
|
||||||
|
nodes_list.add_node_property(
|
||||||
|
(source_id, source_label), {"name": source_name}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Define target node
|
||||||
|
target_id = (
|
||||||
|
record["value"]["allUris"][0]
|
||||||
|
if record["value"]["allUris"]
|
||||||
|
else record["value"]["name"]
|
||||||
|
)
|
||||||
|
target_label = record["value"]["allTypes"][0]["name"].capitalize()
|
||||||
|
target_name = record["value"]["name"]
|
||||||
|
# Some facts are better suited as node properties
|
||||||
|
if target_label in FACT_TO_PROPERTY_TYPE:
|
||||||
|
nodes_list.add_node_property(
|
||||||
|
(source_id, source_label),
|
||||||
|
{format_property_key(record["property"]["name"]): target_name},
|
||||||
|
)
|
||||||
|
else: # Define relationship
|
||||||
|
# Define target node object
|
||||||
|
target_node = Node(id=target_id, type=target_label)
|
||||||
|
nodes_list.add_node_property(
|
||||||
|
(target_id, target_label), {"name": target_name}
|
||||||
|
)
|
||||||
|
# Define relationship type
|
||||||
|
rel_type = record["property"]["name"].replace(" ", "_").upper()
|
||||||
|
if self.simplified_schema:
|
||||||
|
rel_type = self.simplified_schema.get_type(rel_type)
|
||||||
|
|
||||||
|
# Relationship qualifiers/properties
|
||||||
|
rel_properties = dict()
|
||||||
|
relationship_evidence = [el["passage"] for el in record["evidence"]][0]
|
||||||
|
if self.include_evidence:
|
||||||
|
rel_properties.update({"evidence": relationship_evidence})
|
||||||
|
if self.include_qualifiers and record.get("qualifiers"):
|
||||||
|
for property in record["qualifiers"]:
|
||||||
|
prop_key = format_property_key(property["property"]["name"])
|
||||||
|
rel_properties[prop_key] = property["value"]["name"]
|
||||||
|
|
||||||
|
relationship = Relationship(
|
||||||
|
source=source_node,
|
||||||
|
target=target_node,
|
||||||
|
type=rel_type,
|
||||||
|
properties=rel_properties,
|
||||||
|
)
|
||||||
|
relationships.append(relationship)
|
||||||
|
|
||||||
|
return GraphDocument(
|
||||||
|
nodes=nodes_list.return_node_list(),
|
||||||
|
relationships=relationships,
|
||||||
|
source=document,
|
||||||
|
)
|
||||||
|
|
||||||
|
def convert_to_graph_documents(
|
||||||
|
self, documents: Sequence[Document]
|
||||||
|
) -> List[GraphDocument]:
|
||||||
|
"""Convert a sequence of documents into graph documents.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
documents (Sequence[Document]): The original documents.
|
||||||
|
**kwargs: Additional keyword arguments.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Sequence[GraphDocument]: The transformed documents as graphs.
|
||||||
|
"""
|
||||||
|
results = []
|
||||||
|
for document in documents:
|
||||||
|
raw_results = self.nlp_request(document.page_content)
|
||||||
|
graph_document = self.process_response(raw_results, document)
|
||||||
|
results.append(graph_document)
|
||||||
|
return results
|
@ -0,0 +1,38 @@
|
|||||||
|
"""Vector SQL Database Chain Retriever"""
|
||||||
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
|
from langchain.callbacks.manager import (
|
||||||
|
AsyncCallbackManagerForRetrieverRun,
|
||||||
|
CallbackManagerForRetrieverRun,
|
||||||
|
)
|
||||||
|
from langchain.schema import BaseRetriever, Document
|
||||||
|
|
||||||
|
from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
|
||||||
|
|
||||||
|
|
||||||
|
class VectorSQLDatabaseChainRetriever(BaseRetriever):
|
||||||
|
"""Retriever that uses SQLDatabase as Retriever"""
|
||||||
|
|
||||||
|
sql_db_chain: VectorSQLDatabaseChain
|
||||||
|
"""SQL Database Chain"""
|
||||||
|
page_content_key: str = "content"
|
||||||
|
"""column name for page content of documents"""
|
||||||
|
|
||||||
|
def _get_relevant_documents(
|
||||||
|
self,
|
||||||
|
query: str,
|
||||||
|
*,
|
||||||
|
run_manager: CallbackManagerForRetrieverRun,
|
||||||
|
**kwargs: Any,
|
||||||
|
) -> List[Document]:
|
||||||
|
ret: List[Dict[str, Any]] = self.sql_db_chain(
|
||||||
|
query, callbacks=run_manager.get_child(), **kwargs
|
||||||
|
)["result"]
|
||||||
|
return [
|
||||||
|
Document(page_content=r[self.page_content_key], metadata=r) for r in ret
|
||||||
|
]
|
||||||
|
|
||||||
|
async def _aget_relevant_documents(
|
||||||
|
self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
|
||||||
|
) -> List[Document]:
|
||||||
|
raise NotImplementedError
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue