Better deanonymizer matching strategy (#11557)

@baskaryan, @hwchase17
pull/11570/head
maks-operlejn-ds 9 months ago committed by GitHub
parent a992b9670d
commit 4d62def9ff
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -1,5 +1,12 @@
from abc import ABC, abstractmethod
from typing import Optional
from typing import Callable, Optional
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
exact_matching_strategy,
)
DEFAULT_DEANONYMIZER_MATCHING_STRATEGY = exact_matching_strategy
class AnonymizerBase(ABC):
@ -23,10 +30,20 @@ class ReversibleAnonymizerBase(AnonymizerBase):
Base abstract class for reversible anonymizers.
"""
def deanonymize(self, text: str) -> str:
def deanonymize(
self,
text_to_deanonymize: str,
deanonymizer_matching_strategy: Callable[
[str, MappingDataType], str
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
) -> str:
"""Deanonymize text"""
return self._deanonymize(text)
return self._deanonymize(text_to_deanonymize, deanonymizer_matching_strategy)
@abstractmethod
def _deanonymize(self, text: str) -> str:
def _deanonymize(
self,
text_to_deanonymize: str,
deanonymizer_matching_strategy: Callable[[str, MappingDataType], str],
) -> str:
"""Abstract method to deanonymize text"""

@ -1,9 +1,12 @@
from langchain_experimental.data_anonymizer.presidio import MappingDataType
import re
from typing import List
from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
"""
Default matching strategy for deanonymization.
Exact matching strategy for deanonymization.
It replaces all the anonymized entities with the original ones.
Args:
@ -15,3 +18,168 @@ def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType)
for anonymized, original in deanonymizer_mapping[entity_type].items():
text = text.replace(anonymized, original)
return text
def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType
) -> str:
    """
    Case insensitive matching strategy for deanonymization.

    It replaces all the anonymized entities with the original ones
    irrespective of their letter case.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones

    Examples of matching:
        keanu reeves -> Keanu Reeves
        JOHN F. KENNEDY -> John F. Kennedy
    """
    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # Use regular expressions for case-insensitive matching and replacing.
            # re.escape: the anonymized value is literal text, not a pattern —
            # values such as "wdavis@example.net" or "313-666-7440" contain
            # regex metacharacters ("." would otherwise match any character).
            # A callable replacement keeps backslashes / "\g<...>" sequences in
            # the original value from being interpreted as group references.
            text = re.sub(
                re.escape(anonymized),
                lambda _match, _original=original: _original,
                text,
                flags=re.IGNORECASE,
            )
    return text
def fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """
    Fuzzy matching strategy for deanonymization.

    Locates each anonymized entity in the text via approximate (Levenshtein)
    matching and substitutes the corresponding original value.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    try:
        from fuzzysearch import find_near_matches
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzysearch, please install with "
            "`pip install fuzzysearch`."
        ) from e

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # Rebuild the text piecewise: untouched spans between matches are
            # kept, each matched span is swapped for the original value.
            pieces = []
            cursor = 0
            for match in find_near_matches(anonymized, text, max_l_dist=max_l_dist):
                pieces.append(text[cursor : match.start])
                pieces.append(original)
                cursor = match.end
            pieces.append(text[cursor:])
            text = "".join(pieces)
    return text
def combined_exact_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """
    RECOMMENDED STRATEGY.
    Combined exact and fuzzy matching strategy for deanonymization.

    Runs the exact strategy first, then the fuzzy strategy on whatever
    entities the exact pass did not replace.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    return fuzzy_matching_strategy(
        exact_matching_strategy(text, deanonymizer_mapping),
        deanonymizer_mapping,
        max_l_dist,
    )
def ngram_fuzzy_matching_strategy(
    text: str,
    deanonymizer_mapping: MappingDataType,
    fuzzy_threshold: int = 85,
    use_variable_length: bool = True,
) -> str:
    """
    N-gram fuzzy matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.
    It uses fuzzy matching to find the position of the anonymized entity in the text.
    It generates n-grams of the same length as the anonymized entity from the text and
    uses fuzzy matching to find the position of the anonymized entity in the text.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        fuzzy_threshold: fuzzy matching threshold
        use_variable_length: whether to use (n-1, n, n+1)-grams or just n-grams

    Returns:
        The text with matched n-gram word spans replaced by original values.
    """

    def generate_ngrams(words_list: List[str], n: int) -> list:
        """Generate n-grams from a list of words"""
        return [
            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
        ]

    try:
        from fuzzywuzzy import fuzz
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    text_words = text.split()
    # Each replacement is (start word index, n-gram length, original value).
    replacements = []
    # Word indices already claimed by an accepted match, to avoid replacing
    # the same span twice for different entities / gram lengths.
    matched_indices: List[int] = []
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            anonymized_words = anonymized.split()
            if use_variable_length:
                # Also try one word shorter/longer than the entity, to absorb
                # tokenization drift (extra initial, dropped middle name, ...).
                gram_lengths = [
                    len(anonymized_words) - 1,
                    len(anonymized_words),
                    len(anonymized_words) + 1,
                ]
            else:
                gram_lengths = [len(anonymized_words)]
            for n in gram_lengths:
                if n > 0:  # Take only positive values
                    segments = generate_ngrams(text_words, n)
                    for i, segment in enumerate(segments):
                        # NOTE(review): the overlap guard checks only the
                        # segment's start index `i`; a candidate starting on a
                        # fresh index whose tail overlaps an earlier match can
                        # still be accepted — confirm this is intended.
                        if (
                            fuzz.ratio(anonymized.lower(), segment.lower())
                            > fuzzy_threshold
                            and i not in matched_indices
                        ):
                            replacements.append((i, n, original))
                            # Add the matched segment indices to the list
                            matched_indices.extend(range(i, i + n))

    # Sort replacements by index in reverse order
    replacements.sort(key=lambda x: x[0], reverse=True)

    # Apply replacements in reverse order to not affect subsequent indices
    for start, length, replacement in replacements:
        text_words[start : start + length] = replacement.split()

    return " ".join(text_words)

@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
import yaml
from langchain_experimental.data_anonymizer.base import (
DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
AnonymizerBase,
ReversibleAnonymizerBase,
)
@ -16,7 +17,7 @@ from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
create_anonymizer_mapping,
)
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
default_matching_strategy,
exact_matching_strategy,
)
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
get_pseudoanonymizer_mapping,
@ -190,7 +191,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
filtered_analyzer_results,
anonymizer_results,
)
return default_matching_strategy(text, anonymizer_mapping)
return exact_matching_strategy(text, anonymizer_mapping)
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
@ -282,14 +283,14 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
)
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
return default_matching_strategy(text, self.anonymizer_mapping)
return exact_matching_strategy(text, self.anonymizer_mapping)
def _deanonymize(
self,
text_to_deanonymize: str,
deanonymizer_matching_strategy: Callable[
[str, MappingDataType], str
] = default_matching_strategy,
] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
) -> str:
"""Deanonymize text.
Each anonymized entity is replaced with its original value.

@ -126,3 +126,76 @@ def test_non_faker_values() -> None:
anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
anonymized_text = anonymizer.anonymize(text)
assert anonymized_text == expected_result
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_exact_matching_strategy() -> None:
    """
    Test exact matching strategy for deanonymization.

    Every anonymized value appears verbatim in the text, so each one must be
    replaced by its original counterpart.
    """
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    # Mapping: anonymized (fake) value -> original value, keyed by entity type.
    deanonymizer_mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    text = (
        "Are you Maria Lynch? I found your card with number 213186379402654. "
        "Is this your phone number: 7344131647? "
        "Is this your email address: wdavis@example.net"
    )

    deanonymized_text = dms.exact_matching_strategy(text, deanonymizer_mapping)

    expected_originals = [
        "Slim Shady",
        "313-666-7440",
        "real.slim.shady@gmail.com",
        "4916 0387 9536 0861",
    ]
    assert all(value in deanonymized_text for value in expected_originals)
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_best_matching_strategy() -> None:
    """
    Test combined exact-fuzzy matching strategy for deanonymization.

    Most anonymized values in the text are slightly altered (extra initial,
    reformatted phone/card number) to exercise the fuzzy pass; the email is
    left unchanged to cover the exact-match path.
    """
    from langchain_experimental.data_anonymizer import (
        deanonymizer_matching_strategies as dms,
    )

    # Mapping: anonymized (fake) value -> original value, keyed by entity type.
    deanonymizer_mapping = {
        "PERSON": {"Maria Lynch": "Slim Shady"},
        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
    }

    # Changed some values:
    # - "Maria Lynch" -> "Maria K. Lynch"
    # - "7344131647" -> "734-413-1647"
    # - "213186379402654" -> "2131 8637 9402 654"
    # - "wdavis@example.net" -> the same to test exact match
    # NOTE(review): the last two fixture segments concatenate without a space
    # ("...1647?Is this..."); harmless for these assertions but looks
    # unintended — confirm.
    text = (
        "Are you Maria K. Lynch? I found your card with number 2131 8637 9402 654. "
        "Is this your phone number: 734-413-1647?"
        "Is this your email address: wdavis@example.net"
    )

    deanonymized_text = dms.combined_exact_fuzzy_matching_strategy(
        text, deanonymizer_mapping
    )

    # Every original value must be restored in the deanonymized text.
    for original_value in [
        "Slim Shady",
        "313-666-7440",
        "real.slim.shady@gmail.com",
        "4916 0387 9536 0861",
    ]:
        assert original_value in deanonymized_text

Loading…
Cancel
Save