Better deanonymizer matching strategy (#11557)

@baskaryan, @hwchase17
1 year ago · 4d62def9ff
parent a992b9670d
commit 4d62def9ff
5 changed files with 893 additions and 510 deletions
--- a/docs/docs_skeleton/docs/guides/privacy/presidio_data_anonymization/reversible.ipynb
+++ b/docs/docs_skeleton/docs/guides/privacy/presidio_data_anonymization/reversible.ipynb
@ -271,7 +271,7 @@
      },
      {
         "cell_type": "code",
-   "execution_count": 8,
+         "execution_count": 1,
         "metadata": {},
         "outputs": [
            {
@ -283,7 +283,7 @@
                     " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}"
                  ]
               },
-     "execution_count": 8,
+               "execution_count": 1,
               "metadata": {},
               "output_type": "execute_result"
            }
@ -315,7 +315,7 @@
      },
      {
         "cell_type": "code",
-   "execution_count": 9,
+         "execution_count": 2,
         "metadata": {},
         "outputs": [
            {
@ -335,7 +335,7 @@
                     "  '3537672423884966': '4001 9192 5753 7193'}}"
                  ]
               },
-     "execution_count": 9,
+               "execution_count": 2,
               "metadata": {},
               "output_type": "execute_result"
            }
@ -359,7 +359,7 @@
      },
      {
         "cell_type": "code",
-   "execution_count": 10,
+         "execution_count": 3,
         "metadata": {},
         "outputs": [
            {
@ -422,7 +422,7 @@
      },
      {
         "cell_type": "code",
-   "execution_count": 12,
+         "execution_count": 4,
         "metadata": {},
         "outputs": [
            {
@ -431,7 +431,7 @@
                     "{}"
                  ]
               },
-     "execution_count": 12,
+               "execution_count": 4,
               "metadata": {},
               "output_type": "execute_result"
            }
@ -444,7 +444,7 @@
      },
      {
         "cell_type": "code",
-   "execution_count": 13,
+         "execution_count": 5,
         "metadata": {},
         "outputs": [
            {
@ -457,7 +457,7 @@
                     "  '3537672423884966': '4001 9192 5753 7193'}}"
                  ]
               },
-     "execution_count": 13,
+               "execution_count": 5,
               "metadata": {},
               "output_type": "execute_result"
            }
@ -468,6 +468,130 @@
            "anonymizer.deanonymizer_mapping"
         ]
      },
+      {
+         "cell_type": "markdown",
+         "metadata": {},
+         "source": [
+            "### Custom deanonymization strategy\n",
+            "\n",
+            "The default deanonymization strategy is to exactly match the substring in the text with the mapping entry. Because LLMs are non-deterministic, the model may slightly change the format of the private data or introduce a typo, for example:\n",
+            "- *Keanu Reeves* -> *Kaenu Reeves*\n",
+            "- *John F. Kennedy* -> *John Kennedy*\n",
+            "- *Main St, New York* -> *New York*\n",
+            "\n",
+            "It is therefore worth considering appropriate prompt engineering (have the model return PII in an unchanged format) or implementing your own replacement strategy. For example, you can use fuzzy matching - this will solve problems with typos and minor changes in the text. Some ready-made matching strategies can be found in the file `deanonymizer_matching_strategies.py`."
+         ]
+      },
+      {
+         "cell_type": "code",
+         "execution_count": 6,
+         "metadata": {},
+         "outputs": [
+            {
+               "name": "stdout",
+               "output_type": "stream",
+               "text": [
+                  "maria lynch\n",
+                  "Slim Shady\n"
+               ]
+            }
+         ],
+         "source": [
+            "from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
+            "    case_insensitive_matching_strategy,\n",
+            ")\n",
+            "\n",
+            "# Original name: Maria Lynch\n",
+            "print(anonymizer.deanonymize(\"maria lynch\"))\n",
+            "print(\n",
+            "    anonymizer.deanonymize(\n",
+            "        \"maria lynch\", deanonymizer_matching_strategy=case_insensitive_matching_strategy\n",
+            "    )\n",
+            ")"
+         ]
+      },
+      {
+         "cell_type": "code",
+         "execution_count": 7,
+         "metadata": {},
+         "outputs": [
+            {
+               "name": "stdout",
+               "output_type": "stream",
+               "text": [
+                  "Call Maria K. Lynch at 734-413-1647\n",
+                  "Call Slim Shady at 313-666-7440\n"
+               ]
+            }
+         ],
+         "source": [
+            "from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
+            "    fuzzy_matching_strategy,\n",
+            ")\n",
+            "\n",
+            "# Original name: Maria Lynch\n",
+            "# Original phone number: 7344131647 (without dashes)\n",
+            "print(anonymizer.deanonymize(\"Call Maria K. Lynch at 734-413-1647\"))\n",
+            "print(\n",
+            "    anonymizer.deanonymize(\n",
+            "        \"Call Maria K. Lynch at 734-413-1647\",\n",
+            "        deanonymizer_matching_strategy=fuzzy_matching_strategy,\n",
+            "    )\n",
+            ")"
+         ]
+      },
+      {
+         "cell_type": "markdown",
+         "metadata": {},
+         "source": [
+            "It seems that the combined method works best:\n",
+            "- first apply the exact match strategy\n",
+            "- then match the rest using the fuzzy strategy"
+         ]
+      },
+      {
+         "cell_type": "code",
+         "execution_count": 20,
+         "metadata": {},
+         "outputs": [
+            {
+               "name": "stdout",
+               "output_type": "stream",
+               "text": [
+                  "Are you Slim Shady? I found your card with number 4916 0387 9536 0861.\n",
+                  "Is this your phone number: 313-666-7440?\n",
+                  "Is this your email address: wdavis@example.net\n"
+               ]
+            }
+         ],
+         "source": [
+            "from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (\n",
+            "    combined_exact_fuzzy_matching_strategy,\n",
+            ")\n",
+            "\n",
+            "# Changed some values for fuzzy match showcase:\n",
+            "# - \"Maria Lynch\" -> \"Maria F. Lynch\"\n",
+            "# - \"7344131647\" -> \"734-413-1647\"\n",
+            "# - \"4838637940262\" -> \"4838 6379 40262\"\n",
+            "print(\n",
+            "    anonymizer.deanonymize(\n",
+            "        (\n",
+            "            \"Are you Maria F. Lynch? I found your card with number 4838 6379 40262.\\n\"\n",
+            "            \"Is this your phone number: 734-413-1647?\\n\"\n",
+            "            \"Is this your email address: wdavis@example.net\"\n",
+            "        ),\n",
+            "        deanonymizer_matching_strategy=combined_exact_fuzzy_matching_strategy,\n",
+            "    )\n",
+            ")"
+         ]
+      },
+      {
+         "cell_type": "markdown",
+         "metadata": {},
+         "source": [
+            "Of course, there is no perfect method and it is worth experimenting and finding the one best suited to your use case."
+         ]
+      },
      {
         "cell_type": "markdown",
         "metadata": {},
--- a/libs/experimental/langchain_experimental/data_anonymizer/base.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py
@ -1,5 +1,12 @@
 from abc import ABC, abstractmethod
-from typing import Optional
+from typing import Callable, Optional
+
+from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType
+from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
+    exact_matching_strategy,
+)
+
+DEFAULT_DEANONYMIZER_MATCHING_STRATEGY = exact_matching_strategy


 class AnonymizerBase(ABC):
@ -23,10 +30,20 @@ class ReversibleAnonymizerBase(AnonymizerBase):
    Base abstract class for reversible anonymizers.
    """

-    def deanonymize(self, text: str) -> str:
+    def deanonymize(
+        self,
+        text_to_deanonymize: str,
+        deanonymizer_matching_strategy: Callable[
+            [str, MappingDataType], str
+        ] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
+    ) -> str:
        """Deanonymize text"""
-        return self._deanonymize(text)
+        return self._deanonymize(text_to_deanonymize, deanonymizer_matching_strategy)

    @abstractmethod
-    def _deanonymize(self, text: str) -> str:
+    def _deanonymize(
+        self,
+        text_to_deanonymize: str,
+        deanonymizer_matching_strategy: Callable[[str, MappingDataType], str],
+    ) -> str:
        """Abstract method to deanonymize text"""
--- a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py
@ -1,9 +1,12 @@
-from langchain_experimental.data_anonymizer.presidio import MappingDataType
+import re
+from typing import List

+from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType

-def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
+
+def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
    """
-    Default matching strategy for deanonymization.
+    Exact matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.

    Args:
@ -15,3 +18,168 @@ def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType)
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            text = text.replace(anonymized, original)
    return text
+
+
+def case_insensitive_matching_strategy(
+    text: str, deanonymizer_mapping: MappingDataType
+) -> str:
+    """
+    Case insensitive matching strategy for deanonymization.
+    It replaces all the anonymized entities with the original ones
+        irrespective of their letter case.
+
+    Both the anonymized and the original values are treated as literal text,
+    not as regular expression syntax.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+
+    Returns:
+        the text with every case-insensitive occurrence of an anonymized value
+        replaced with its original value
+
+    Examples of matching:
+        keanu reeves -> Keanu Reeves
+        JOHN F. KENNEDY -> John F. Kennedy
+    """
+
+    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
+    for entity_type in deanonymizer_mapping:
+        for anonymized, original in deanonymizer_mapping[entity_type].items():
+            # Escape the anonymized value so that regex metacharacters in it
+            # (e.g. "." in an email or "+" in a phone number) match literally
+            pattern = re.escape(anonymized)
+            # Backslashes are the only special characters in a replacement
+            # string; double them so the original value is inserted verbatim
+            replacement = original.replace("\\", "\\\\")
+            text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+    return text
+
+
+def fuzzy_matching_strategy(
+    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
+) -> str:
+    """
+    Fuzzy matching strategy for deanonymization.
+    It uses fuzzy matching to find the position of the anonymized entity in the text.
+    It replaces all the anonymized entities with the original ones.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
+            text segment to consider it a match
+
+    Returns:
+        the text in which every segment matched for an anonymized entity is
+        replaced with the corresponding original value
+
+    Raises:
+        ImportError: if the `fuzzysearch` package cannot be imported
+
+    Examples of matching:
+        Kaenu Reves -> Keanu Reeves
+        John F. Kennedy -> John Kennedy
+    """
+
+    # Imported inside the function, so fuzzysearch is only needed when this
+    # strategy is actually called
+    try:
+        from fuzzysearch import find_near_matches
+    except ImportError as e:
+        raise ImportError(
+            "Could not import fuzzysearch, please install with "
+            "`pip install fuzzysearch`."
+        ) from e
+
+    for entity_type in deanonymizer_mapping:
+        for anonymized, original in deanonymizer_mapping[entity_type].items():
+            matches = find_near_matches(anonymized, text, max_l_dist=max_l_dist)
+            # Rebuild the text piece by piece, swapping each matched span for
+            # the original value
+            new_text = ""
+            last_end = 0
+            for m in matches:
+                # add the text that isn't part of a match
+                new_text += text[last_end : m.start]
+                # add the replacement text
+                new_text += original
+                last_end = m.end
+            # add the remaining text that wasn't part of a match
+            new_text += text[last_end:]
+            # later mapping entries are matched against the already updated text
+            text = new_text
+
+    return text
+
+
+def combined_exact_fuzzy_matching_strategy(
+    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
+) -> str:
+    """
+    RECOMMENDED STRATEGY.
+    Combined exact and fuzzy matching strategy for deanonymization.
+    It first applies the exact matching strategy and then runs the fuzzy
+    matching strategy on the result.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
+            text segment to consider it a match
+
+    Returns:
+        the text after both the exact and the fuzzy replacement passes
+
+    Examples of matching:
+        Kaenu Reves -> Keanu Reeves
+        John F. Kennedy -> John Kennedy
+    """
+    # Exact pass first; the fuzzy pass then works on its output
+    text = exact_matching_strategy(text, deanonymizer_mapping)
+    text = fuzzy_matching_strategy(text, deanonymizer_mapping, max_l_dist)
+    return text
+
+
+def ngram_fuzzy_matching_strategy(
+    text: str,
+    deanonymizer_mapping: MappingDataType,
+    fuzzy_threshold: int = 85,
+    use_variable_length: bool = True,
+) -> str:
+    """
+    N-gram fuzzy matching strategy for deanonymization.
+    It replaces all the anonymized entities with the original ones.
+    It splits the text into whitespace-separated words, generates word n-grams
+    of the same length as the anonymized entity (optionally also one word
+    shorter and one word longer) and compares each n-gram with the anonymized
+    entity using case-insensitive fuzzy matching.
+    N-grams that overlap an already matched segment are skipped, so every word
+    of the text is replaced at most once.
+
+    Note that the result is re-joined with single spaces, so the original
+    whitespace of the text is not preserved.
+
+    Args:
+        text: text to deanonymize
+        deanonymizer_mapping: mapping between anonymized entities and original ones
+        fuzzy_threshold: fuzzy matching threshold; an n-gram matches when its
+            `fuzz.ratio` score is strictly greater than this value
+        use_variable_length: whether to use (n-1, n, n+1)-grams or just n-grams
+
+    Returns:
+        the deanonymized text
+
+    Raises:
+        ImportError: if the `fuzzywuzzy` package cannot be imported
+    """
+
+    def generate_ngrams(words_list: List[str], n: int) -> list:
+        """Generate n-grams from a list of words"""
+        return [
+            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
+        ]
+
+    try:
+        from fuzzywuzzy import fuzz
+    except ImportError as e:
+        raise ImportError(
+            "Could not import fuzzywuzzy, please install with "
+            "`pip install fuzzywuzzy`."
+        ) from e
+
+    text_words = text.split()
+    replacements = []
+    matched_indices: List[int] = []
+
+    for entity_type in deanonymizer_mapping:
+        for anonymized, original in deanonymizer_mapping[entity_type].items():
+            anonymized_words = anonymized.split()
+
+            if use_variable_length:
+                gram_lengths = [
+                    len(anonymized_words) - 1,
+                    len(anonymized_words),
+                    len(anonymized_words) + 1,
+                ]
+            else:
+                gram_lengths = [len(anonymized_words)]
+            for n in gram_lengths:
+                if n > 0:  # Take only positive values
+                    segments = generate_ngrams(text_words, n)
+                    for i, segment in enumerate(segments):
+                        # Skip n-grams touching any already matched word, not
+                        # only those starting on one: overlapping slice
+                        # replacements would overwrite neighbouring words
+                        if any(idx in matched_indices for idx in range(i, i + n)):
+                            continue
+                        if (
+                            fuzz.ratio(anonymized.lower(), segment.lower())
+                            > fuzzy_threshold
+                        ):
+                            replacements.append((i, n, original))
+                            # Add the matched segment indices to the list
+                            matched_indices.extend(range(i, i + n))
+
+    # Sort replacements by index in reverse order
+    replacements.sort(key=lambda x: x[0], reverse=True)
+
+    # Apply replacements in reverse order to not affect subsequent indices
+    for start, length, replacement in replacements:
+        text_words[start : start + length] = replacement.split()
+
+    return " ".join(text_words)
--- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
+++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py
@ -7,6 +7,7 @@ from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
 import yaml

 from langchain_experimental.data_anonymizer.base import (
+    DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
    AnonymizerBase,
    ReversibleAnonymizerBase,
 )
@ -16,7 +17,7 @@ from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
    create_anonymizer_mapping,
 )
 from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
-    default_matching_strategy,
+    exact_matching_strategy,
 )
 from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
    get_pseudoanonymizer_mapping,
@ -190,7 +191,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
            filtered_analyzer_results,
            anonymizer_results,
        )
-        return default_matching_strategy(text, anonymizer_mapping)
+        return exact_matching_strategy(text, anonymizer_mapping)


 class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
@ -282,14 +283,14 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
        )
        self._deanonymizer_mapping.update(new_deanonymizer_mapping)

-        return default_matching_strategy(text, self.anonymizer_mapping)
+        return exact_matching_strategy(text, self.anonymizer_mapping)

    def _deanonymize(
        self,
        text_to_deanonymize: str,
        deanonymizer_matching_strategy: Callable[
            [str, MappingDataType], str
-        ] = default_matching_strategy,
+        ] = DEFAULT_DEANONYMIZER_MATCHING_STRATEGY,
    ) -> str:
        """Deanonymize text.
        Each anonymized entity is replaced with its original value.
--- a/libs/experimental/tests/unit_tests/test_data_anonymizer.py
+++ b/libs/experimental/tests/unit_tests/test_data_anonymizer.py
@ -126,3 +126,76 @@ def test_non_faker_values() -> None:
    anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
    anonymized_text = anonymizer.anonymize(text)
    assert anonymized_text == expected_result
+
+
+@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
+def test_exact_matching_strategy() -> None:
+    """
+    Test exact matching strategy for deanonymization.
+
+    Every anonymized value appears verbatim in the text, so each one is expected
+    to be replaced with its original value.
+    """
+    from langchain_experimental.data_anonymizer import (
+        deanonymizer_matching_strategies as dms,
+    )
+
+    deanonymizer_mapping = {
+        "PERSON": {"Maria Lynch": "Slim Shady"},
+        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
+        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
+        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
+    }
+
+    text = (
+        "Are you Maria Lynch? I found your card with number 213186379402654. "
+        "Is this your phone number: 7344131647? "
+        "Is this your email address: wdavis@example.net"
+    )
+
+    deanonymized_text = dms.exact_matching_strategy(text, deanonymizer_mapping)
+
+    # Each original value must be present after deanonymization
+    for original_value in [
+        "Slim Shady",
+        "313-666-7440",
+        "real.slim.shady@gmail.com",
+        "4916 0387 9536 0861",
+    ]:
+        assert original_value in deanonymized_text
+
+
+@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
+def test_best_matching_strategy() -> None:
+    """
+    Test combined exact and fuzzy matching strategy for deanonymization.
+
+    Three anonymized values appear in a slightly altered format and the email
+    address is unchanged, so both the fuzzy and the exact pass are exercised.
+    """
+    from langchain_experimental.data_anonymizer import (
+        deanonymizer_matching_strategies as dms,
+    )
+
+    deanonymizer_mapping = {
+        "PERSON": {"Maria Lynch": "Slim Shady"},
+        "PHONE_NUMBER": {"7344131647": "313-666-7440"},
+        "EMAIL_ADDRESS": {"wdavis@example.net": "real.slim.shady@gmail.com"},
+        "CREDIT_CARD": {"213186379402654": "4916 0387 9536 0861"},
+    }
+
+    # Changed some values:
+    # - "Maria Lynch" -> "Maria K. Lynch"
+    # - "7344131647" -> "734-413-1647"
+    # - "213186379402654" -> "2131 8637 9402 654"
+    # - "wdavis@example.net" -> the same to test exact match
+    text = (
+        "Are you Maria K. Lynch? I found your card with number 2131 8637 9402 654. "
+        "Is this your phone number: 734-413-1647?"
+        "Is this your email address: wdavis@example.net"
+    )
+
+    deanonymized_text = dms.combined_exact_fuzzy_matching_strategy(
+        text, deanonymizer_mapping
+    )
+
+    # Each original value must be present after deanonymization
+    for original_value in [
+        "Slim Shady",
+        "313-666-7440",
+        "real.slim.shady@gmail.com",
+        "4916 0387 9536 0861",
+    ]:
+        assert original_value in deanonymized_text