diff --git a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb index 7bb0b15934..faa9929259 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization.ipynb @@ -28,12 +28,12 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Install necessary packages\n", - "# ! pip install langchain langchain-experimental openai\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", "# ! python -m spacy download en_core_web_lg" ] }, @@ -47,16 +47,16 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Marie Santos, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + "'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'" ] }, - "execution_count": 2, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -64,6 +64,92 @@ "source": [ "from langchain_experimental.data_anonymizer import PresidioAnonymizer\n", "\n", + "anonymizer = PresidioAnonymizer()\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using with LangChain Expression Language\n", + "\n", + "With LCEL we can easily chain together anonymization with the rest of our application." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set env var OPENAI_API_KEY or load from a .env file:\n", + "# import dotenv\n", + "\n", + "# dotenv.load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.schema.runnable import RunnablePassthrough\n", + "\n", + "template = \"\"\"According to this text, where can you find our super secret data?\n", + "\n", + "{anonymized_text}\n", + "\n", + "Answer:\"\"\"\n", + "prompt = PromptTemplate.from_template(template)\n", + "llm = ChatOpenAI()\n", + "\n", + "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", + "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Customization\n", + "We can specify ``analyzed_fields`` to only anonymize particular types of data." + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ "anonymizer = PresidioAnonymizer(analyzed_fields=[\"PERSON\"])\n", "\n", "anonymizer.anonymize(\n", @@ -75,7 +161,6 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "\\\n", "As can be observed, the name was correctly identified and replaced with another. The `analyzed_fields` attribute is responsible for what values are to be detected and substituted. We can add *PHONE_NUMBER* to the list:" ] }, @@ -331,125 +416,6 @@ "anonymizer.anonymize(\"My polish phone number is 666555444\")" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\\\n", - "Finally, it is worth showing how to implement anonymizer as a chain. Since anonymization is based on string operations, we can use `TransformChain` for this:" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'text': 'You can find our super secret data at https://supersecretdata.com',\n", - " 'anonymized_text': 'You can find our super secret data at https://www.fox.org/'}" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain.chains.transform import TransformChain\n", - "\n", - "anonymizer = PresidioAnonymizer()\n", - "\n", - "\n", - "def anonymize_func(inputs: dict) -> dict:\n", - " text = inputs[\"text\"]\n", - " return {\"anonymized_text\": anonymizer.anonymize(text)}\n", - "\n", - "\n", - "anonymize_chain = TransformChain(\n", - " input_variables=[\"text\"],\n", - " output_variables=[\"anonymized_text\"],\n", - " transform=anonymize_func,\n", - ")\n", - "\n", - "anonymize_chain(\"You can find our super secret data at https://supersecretdata.com\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\\\n", - "Later, you can, for example, use such anonymization as part of chain sequence. We will use `LangChain Expression Language` ([learn more here](https://python.langchain.com/docs/guides/expression_language/)) for composing these chains together, as shown below:" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# ! pip install openai\n", - "\n", - "# Set env var OPENAI_API_KEY or load from a .env file:\n", - "import dotenv\n", - "\n", - "dotenv.load_dotenv()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'anonymized_text': StringPromptValue(text='According to this text, where can you find our super secret data?\\n\\nYou can find our super secret data at https://evans-summers.info/\\n\\nAnswer:'),\n", - " 'text': ' https://evans-summers.info/'}" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from operator import itemgetter\n", - "from langchain.prompts.prompt import PromptTemplate\n", - "from langchain.chains.llm import LLMChain\n", - "from langchain.llms.openai import OpenAI\n", - "\n", - "template = \"\"\"According to this text, where can you find our super secret data?\n", - "\n", - "{anonymized_text}\n", - "\n", - "Answer:\"\"\"\n", - "prompt = PromptTemplate(input_variables=[\"anonymized_text\"], template=template)\n", - "llm_chain = LLMChain(llm=OpenAI(), prompt=prompt)\n", - "\n", - "\n", - "chain = (\n", - " anonymize_chain\n", - " | {\"anonymized_text\": itemgetter(\"anonymized_text\")}\n", - " | prompt\n", - " | llm_chain\n", - ")\n", - "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")" - ] - }, { "cell_type": "markdown", "metadata": {},