langchain-experimental: Add allow_list support in experimental/data_anonymizer (#11597)

- **Description:** Add `allow_list` support to the LangChain experimental data-anonymizer package, so that listed terms are passed to the Presidio analyzer and left un-anonymized.
- **Issue:** no
- **Dependencies:** no
- **Tag maintainer:** @hwchase17
- **Twitter handle:**
12 months ago · 70f7558db2
parent 2363c02cf3
commit 70f7558db2
4 changed files with 55 additions and 6 deletions
--- a/libs/experimental/langchain_experimental/data_anonymizer/base.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Callable, Optional
+from typing import Callable, List, Optional

 from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
 from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
@@ -16,12 +16,19 @@ class AnonymizerBase(ABC):
        wrapping the behavior for all methods in a base class.
    """

-    def anonymize(self, text: str, language: Optional[str] = None) -> str:
+    def anonymize(
+        self,
+        text: str,
+        language: Optional[str] = None,
+        allow_list: Optional[List[str]] = None,
+    ) -> str:
        """Anonymize text"""
-        return self._anonymize(text, language)
+        return self._anonymize(text, language, allow_list)

    @abstractmethod
-    def _anonymize(self, text: str, language: Optional[str]) -> str:
+    def _anonymize(
+        self, text: str, language: Optional[str], allow_list: Optional[List[str]] = None
+    ) -> str:
        """Abstract method to anonymize text"""


--- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
@@ -139,7 +139,12 @@ class PresidioAnonymizerBase(AnonymizerBase):


 class PresidioAnonymizer(PresidioAnonymizerBase):
-    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
+    def _anonymize(
+        self,
+        text: str,
+        language: Optional[str] = None,
+        allow_list: Optional[List[str]] = None,
+    ) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.
@@ -172,6 +177,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
            text,
            entities=self.analyzed_fields,
            language=language,
+            allow_list=allow_list,
        )

        filtered_analyzer_results = (
@@ -226,7 +232,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
            for key, inner_dict in self.deanonymizer_mapping.items()
        }

-    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
+    def _anonymize(
+        self,
+        text: str,
+        language: Optional[str] = None,
+        allow_list: Optional[List[str]] = None,
+    ) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.
@@ -261,6 +272,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
            text,
            entities=self.analyzed_fields,
            language=language,
+            allow_list=allow_list,
        )

        filtered_analyzer_results = (
--- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
@@ -36,6 +36,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
    assert ("John Doe" in anonymized_text) == should_contain


+@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
+@pytest.mark.parametrize(
+    "analyzed_fields,should_contain",
+    [(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)],
+)
+def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None:
+    """Test that a name on the allow list is left un-anonymized"""
+    from langchain_experimental.data_anonymizer import PresidioAnonymizer
+
+    text = "Hello, my name is John Doe."
+    anonymizer = PresidioAnonymizer(analyzed_fields=analyzed_fields)
+    anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"])
+    assert ("John Doe" in anonymized_text) == should_contain
+
+
 @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
 def test_anonymize_multiple() -> None:
    """Test anonymizing multiple items in a sentence"""
--- a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py
@@ -37,6 +37,21 @@ def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
    assert ("John Doe" in anonymized_text) == should_contain


+@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
+@pytest.mark.parametrize(
+    "analyzed_fields,should_contain",
+    [(["PERSON"], True), (["PHONE_NUMBER"], True), (None, True)],
+)
+def test_anonymize_allow_list(analyzed_fields: List[str], should_contain: bool) -> None:
+    """Test that a name on the allow list is left un-anonymized"""
+    from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
+
+    text = "Hello, my name is John Doe."
+    anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields)
+    anonymized_text = anonymizer.anonymize(text, allow_list=["John Doe"])
+    assert ("John Doe" in anonymized_text) == should_contain
+
+
 @pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
 def test_anonymize_multiple() -> None:
    """Test anonymizing multiple items in a sentence"""