mirror of
https://github.com/hwchase17/langchain
synced 2024-11-08 07:10:35 +00:00
Data anonymizer notebook nit (#10062)
This commit is contained in:
parent
19400ba253
commit
8d66b00c73
@ -28,12 +28,12 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Install necessary packages\n",
|
||||
"# ! pip install langchain langchain-experimental openai\n",
|
||||
"# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n",
|
||||
"# ! python -m spacy download en_core_web_lg"
|
||||
]
|
||||
},
|
||||
@ -47,16 +47,16 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'My name is Marie Santos, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
|
||||
"'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -64,6 +64,92 @@
|
||||
"source": [
|
||||
"from langchain_experimental.data_anonymizer import PresidioAnonymizer\n",
|
||||
"\n",
|
||||
"anonymizer = PresidioAnonymizer()\n",
|
||||
"\n",
|
||||
"anonymizer.anonymize(\n",
|
||||
" \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com\"\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Using with LangChain Expression Language\n",
|
||||
"\n",
|
||||
"With LCEL we can easily chain together anonymization with the rest of our application."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Set env var OPENAI_API_KEY or load from a .env file:\n",
|
||||
"# import dotenv\n",
|
||||
"\n",
|
||||
"# dotenv.load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)"
|
||||
]
|
||||
},
|
||||
"execution_count": 16,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||
"from langchain.chat_models import ChatOpenAI\n",
|
||||
"from langchain.schema.runnable import RunnablePassthrough\n",
|
||||
"\n",
|
||||
"template = \"\"\"According to this text, where can you find our super secret data?\n",
|
||||
"\n",
|
||||
"{anonymized_text}\n",
|
||||
"\n",
|
||||
"Answer:\"\"\"\n",
|
||||
"prompt = PromptTemplate.from_template(template)\n",
|
||||
"llm = ChatOpenAI()\n",
|
||||
"\n",
|
||||
"chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n",
|
||||
"chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Customization\n",
|
||||
"We can specify ``analyzed_fields`` to only anonymize particular types of data."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'My name is Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
|
||||
]
|
||||
},
|
||||
"execution_count": 18,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"anonymizer = PresidioAnonymizer(analyzed_fields=[\"PERSON\"])\n",
|
||||
"\n",
|
||||
"anonymizer.anonymize(\n",
|
||||
@ -75,7 +161,6 @@
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\\n",
|
||||
"As can be observed, the name was correctly identified and replaced with another. The `analyzed_fields` attribute is responsible for what values are to be detected and substituted. We can add *PHONE_NUMBER* to the list:"
|
||||
]
|
||||
},
|
||||
@ -331,125 +416,6 @@
|
||||
"anonymizer.anonymize(\"My polish phone number is 666555444\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\\n",
|
||||
"Finally, it is worth showing how to implement anonymizer as a chain. Since anonymization is based on string operations, we can use `TransformChain` for this:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'text': 'You can find our super secret data at https://supersecretdata.com',\n",
|
||||
" 'anonymized_text': 'You can find our super secret data at https://www.fox.org/'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from langchain.chains.transform import TransformChain\n",
|
||||
"\n",
|
||||
"anonymizer = PresidioAnonymizer()\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"def anonymize_func(inputs: dict) -> dict:\n",
|
||||
" text = inputs[\"text\"]\n",
|
||||
" return {\"anonymized_text\": anonymizer.anonymize(text)}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"anonymize_chain = TransformChain(\n",
|
||||
" input_variables=[\"text\"],\n",
|
||||
" output_variables=[\"anonymized_text\"],\n",
|
||||
" transform=anonymize_func,\n",
|
||||
")\n",
|
||||
"\n",
|
||||
"anonymize_chain(\"You can find our super secret data at https://supersecretdata.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\\\n",
|
||||
"Later, you can, for example, use such anonymization as part of chain sequence. We will use `LangChain Expression Language` ([learn more here](https://python.langchain.com/docs/guides/expression_language/)) for composing these chains together, as shown below:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"True"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# ! pip install openai\n",
|
||||
"\n",
|
||||
"# Set env var OPENAI_API_KEY or load from a .env file:\n",
|
||||
"import dotenv\n",
|
||||
"\n",
|
||||
"dotenv.load_dotenv()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'anonymized_text': StringPromptValue(text='According to this text, where can you find our super secret data?\\n\\nYou can find our super secret data at https://evans-summers.info/\\n\\nAnswer:'),\n",
|
||||
" 'text': ' https://evans-summers.info/'}"
|
||||
]
|
||||
},
|
||||
"execution_count": 17,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from operator import itemgetter\n",
|
||||
"from langchain.prompts.prompt import PromptTemplate\n",
|
||||
"from langchain.chains.llm import LLMChain\n",
|
||||
"from langchain.llms.openai import OpenAI\n",
|
||||
"\n",
|
||||
"template = \"\"\"According to this text, where can you find our super secret data?\n",
|
||||
"\n",
|
||||
"{anonymized_text}\n",
|
||||
"\n",
|
||||
"Answer:\"\"\"\n",
|
||||
"prompt = PromptTemplate(input_variables=[\"anonymized_text\"], template=template)\n",
|
||||
"llm_chain = LLMChain(llm=OpenAI(), prompt=prompt)\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"chain = (\n",
|
||||
" anonymize_chain\n",
|
||||
" | {\"anonymized_text\": itemgetter(\"anonymized_text\")}\n",
|
||||
" | prompt\n",
|
||||
" | llm_chain\n",
|
||||
")\n",
|
||||
"chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
Loading…
Reference in New Issue
Block a user