langchain/libs/experimental/langchain_experimental/data_anonymizer/presidio.py

from __future__ import annotations

from typing import TYPE_CHECKING, Dict, List, Optional

from langchain_experimental.data_anonymizer.base import AnonymizerBase
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
    get_pseudoanonymizer_mapping,
)

if TYPE_CHECKING:
    from presidio_analyzer import EntityRecognizer
    from presidio_anonymizer.entities import OperatorConfig


class PresidioAnonymizer(AnonymizerBase):
    """Anonymizer using Microsoft Presidio."""

    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        language: str = "en",
        operators: Optional[Dict[str, OperatorConfig]] = None,
    ):
        """
        Args:
            analyzed_fields: List of fields to detect and then anonymize.
                Defaults to all entities supported by Microsoft Presidio.
            language: Language to use for analysis. Defaults to english.
            operators: Operators to use for anonymization.
                Operators allow for custom anonymization of detected PII.
                Learn more:
                https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
        """
        try:
            from presidio_analyzer import AnalyzerEngine
        except ImportError as e:
            raise ImportError(
                "Could not import presidio_analyzer, please install with "
                "`pip install presidio-analyzer`. You will also need to download a "
                "spaCy model to use the analyzer, e.g. "
                "`python -m spacy download en_core_web_lg`."
            ) from e
        try:
            from presidio_anonymizer import AnonymizerEngine
            from presidio_anonymizer.entities import OperatorConfig
        except ImportError as e:
            raise ImportError(
                "Could not import presidio_anonymizer, please install with "
                "`pip install presidio-anonymizer`."
            ) from e

        self.analyzed_fields = (
            analyzed_fields
            if analyzed_fields is not None
            else list(get_pseudoanonymizer_mapping().keys())
        )
        self.language = language
        self.operators = (
            operators
            if operators is not None
            else {
                field: OperatorConfig(
                    operator_name="custom", params={"lambda": faker_function}
                )
                for field, faker_function in get_pseudoanonymizer_mapping().items()
            }
        )
        self._analyzer = AnalyzerEngine()
        self._anonymizer = AnonymizerEngine()

    def _anonymize(self, text: str) -> str:
        results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language=self.language,
        )

        return self._anonymizer.anonymize(
            text,
            analyzer_results=results,
            operators=self.operators,
        ).text

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """Add a recognizer to the analyzer"""
        self._analyzer.registry.add_recognizer(recognizer)
        self.analyzed_fields.extend(recognizer.supported_entities)

    def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
        """Add operators to the anonymizer"""
        self.operators.update(operators)