langchain/libs/experimental/langchain_experimental/data_anonymizer/presidio.py

from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Optional

from langchain_experimental.data_anonymizer.base import AnonymizerBase
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
    get_pseudoanonymizer_mapping,
)

if TYPE_CHECKING:
    from presidio_analyzer import EntityRecognizer
    from presidio_anonymizer.entities import OperatorConfig


class PresidioAnonymizer(AnonymizerBase):
    """Anonymizer using Microsoft Presidio.

    Text is first analyzed to locate the configured entity types and the
    detected spans are then rewritten with the configured operators.
    """

    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
    ):
        """
        Args:
            analyzed_fields: List of fields to detect and then anonymize.
                Defaults to every entity name returned by
                ``get_pseudoanonymizer_mapping()``.
            operators: Operators to use for anonymization.
                Operators allow for custom anonymization of detected PII.
                Defaults to one "custom" operator per entry of
                ``get_pseudoanonymizer_mapping()``, using the mapped
                callable as its ``lambda`` parameter.
                Learn more:
                https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/

        Raises:
            ImportError: If ``presidio_analyzer`` or ``presidio_anonymizer``
                cannot be imported.
        """
        try:
            from presidio_analyzer import AnalyzerEngine
        except ImportError as e:
            raise ImportError(
                "Could not import presidio_analyzer, please install with "
                "`pip install presidio-analyzer`. You will also need to download a "
                "spaCy model to use the analyzer, e.g. "
                "`python -m spacy download en_core_web_lg`."
            ) from e
        try:
            from presidio_anonymizer import AnonymizerEngine
            from presidio_anonymizer.entities import OperatorConfig
        except ImportError as e:
            raise ImportError(
                "Could not import presidio_anonymizer, please install with "
                "`pip install presidio-anonymizer`."
            ) from e

        # Copy caller-supplied containers: add_recognizer() and add_operators()
        # mutate these attributes in place, and that must not leak back into
        # the caller's objects (or into another anonymizer sharing them).
        self.analyzed_fields = (
            list(analyzed_fields)
            if analyzed_fields is not None
            else list(get_pseudoanonymizer_mapping().keys())
        )
        self.operators = (
            dict(operators)
            if operators is not None
            else {
                field: OperatorConfig(
                    operator_name="custom", params={"lambda": faker_function}
                )
                for field, faker_function in get_pseudoanonymizer_mapping().items()
            }
        )
        self._analyzer = AnalyzerEngine()
        self._anonymizer = AnonymizerEngine()

    def _anonymize(self, text: str) -> str:
        """Detect the configured entities in ``text`` and anonymize them.

        Args:
            text: Text to anonymize.

        Returns:
            The ``text`` attribute of the anonymizer engine's result.
        """
        # The analysis language is fixed to English here.
        results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language="en",
        )

        return self._anonymizer.anonymize(
            text,
            analyzer_results=results,
            operators=self.operators,
        ).text

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """Add a recognizer to the analyzer.

        The recognizer is registered with the analyzer's registry and its
        supported entities are added to ``analyzed_fields`` so they are
        requested on subsequent analysis.

        Args:
            recognizer: Recognizer to register.
        """
        self._analyzer.registry.add_recognizer(recognizer)
        # Skip entities that are already listed so the list handed to the
        # analyzer does not accumulate duplicates.
        for entity in recognizer.supported_entities:
            if entity not in self.analyzed_fields:
                self.analyzed_fields.append(entity)

    def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
        """Add operators to the anonymizer.

        Args:
            operators: Mapping merged into the current operators; entries
                whose key already exists replace the existing operator.
        """
        self.operators.update(operators)
Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with the Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00			`from __future__ import annotations`

			`from typing import TYPE_CHECKING, Dict, List, Optional`

			`from langchain_experimental.data_anonymizer.base import AnonymizerBase`
			`from langchain_experimental.data_anonymizer.faker_presidio_mapping import (`
			`get_pseudoanonymizer_mapping,`
			`)`

			`if TYPE_CHECKING:`
			`from presidio_analyzer import EntityRecognizer`
			`from presidio_anonymizer.entities import OperatorConfig`


			`class PresidioAnonymizer(AnonymizerBase):`
			`"""Anonymizer using Microsoft Presidio."""`

			`def __init__(`
			`self,`
			`analyzed_fields: Optional[List[str]] = None,`
			`operators: Optional[Dict[str, OperatorConfig]] = None,`
			`):`
			`"""`
			`Args:`
			`analyzed_fields: List of fields to detect and then anonymize.`
			`Defaults to all entities supported by Microsoft Presidio.`
			`operators: Operators to use for anonymization.`
			`Operators allow for custom anonymization of detected PII.`
			`Learn more:`
			`https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/`
			`"""`
			`try:`
			`from presidio_analyzer import AnalyzerEngine`
			`except ImportError as e:`
			`raise ImportError(`
			`"Could not import presidio_analyzer, please install with "`
			"`pip install presidio-analyzer`. You will also need to download a "
			`"spaCy model to use the analyzer, e.g. "`
			"`python -m spacy download en_core_web_lg`."
			`) from e`
			`try:`
			`from presidio_anonymizer import AnonymizerEngine`
			`from presidio_anonymizer.entities import OperatorConfig`
			`except ImportError as e:`
			`raise ImportError(`
			`"Could not import presidio_anonymizer, please install with "`
			"`pip install presidio-anonymizer`."
			`) from e`

			`self.analyzed_fields = (`
			`analyzed_fields`
			`if analyzed_fields is not None`
			`else list(get_pseudoanonymizer_mapping().keys())`
			`)`
			`self.operators = (`
			`operators`
			`if operators is not None`
			`else {`
			`field: OperatorConfig(`
			`operator_name="custom", params={"lambda": faker_function}`
			`)`
			`for field, faker_function in get_pseudoanonymizer_mapping().items()`
			`}`
			`)`
			`self._analyzer = AnalyzerEngine()`
			`self._anonymizer = AnonymizerEngine()`

			`def _anonymize(self, text: str) -> str:`
			`results = self._analyzer.analyze(`
			`text,`
			`entities=self.analyzed_fields,`
Temporarily remove language selection (#10097) Adapting Microsoft Presidio to other languages requires a bit more work, so for now it is a good idea to remove the option to choose a language, so as not to cause errors and confusion. https://microsoft.github.io/presidio/analyzer/languages/ I will handle different languages after the weekend :smile: 2023-09-01 18:30:48 +00:00			`language="en",`
Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with the Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00			`)`

			`return self._anonymizer.anonymize(`
			`text,`
			`analyzer_results=results,`
			`operators=self.operators,`
			`).text`

			`def add_recognizer(self, recognizer: EntityRecognizer) -> None:`
			`"""Add a recognizer to the analyzer"""`
			`self._analyzer.registry.add_recognizer(recognizer)`
			`self.analyzed_fields.extend(recognizer.supported_entities)`

			`def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:`
			`"""Add operators to the anonymizer"""`
			`self.operators.update(operators)`