import re
from typing import List
Data deanonymization (#10093)
### Description
The feature for pseudonymizing data with the ability to retrieve the
original text (deanonymization) has been implemented. To protect
private data, such as when querying external APIs (e.g. OpenAI), it is
worth pseudonymizing sensitive data to maintain full privacy. Then,
after the model responds, it would be good to have the data back in
its original form.
I implemented the `PresidioReversibleAnonymizer`, which consists of two
parts:
1. anonymization - it works the same way as `PresidioAnonymizer`, plus
the object itself stores a mapping of made-up values to original ones,
for example:
```
{
"PERSON": {
"<anonymized>": "<original>",
"John Doe": "Slim Shady"
},
"PHONE_NUMBER": {
"111-111-1111": "555-555-5555"
}
...
}
```
2. deanonymization - using the mapping described above, it matches the
fake data with the original data and then substitutes it.
Between anonymization and deanonymization, the user can perform
different operations, for example, passing the output to an LLM.
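The two halves can be sketched with plain dictionaries (the entity values here are made up; `PresidioReversibleAnonymizer` builds and stores the mapping itself):

```python
# anonymization: replace each original value with a fake one and
# record the reverse mapping for later deanonymization
original_to_fake = {"Slim Shady": "John Doe", "555-555-5555": "111-111-1111"}
fake_to_original = {}

text = "Call Slim Shady at 555-555-5555."
for original, fake in original_to_fake.items():
    text = text.replace(original, fake)
    fake_to_original[fake] = original

print(text)  # Call John Doe at 111-111-1111.
print(fake_to_original["John Doe"])  # Slim Shady
```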
### Future works
- **instance anonymization** - at this point, each occurrence of PII is
treated as a separate entity and separately anonymized. Therefore, two
occurrences of the name John Doe in the text will be changed to two
different names. It is therefore worth introducing support for full
instance detection, so that repeated occurrences are treated as a single
object.
- **better matching and substitution of fake values for real ones** -
currently the strategy is based on matching full strings and then
substituting them. Due to the indeterminism of language models, it may
happen that the value in the answer is slightly changed (e.g. *John Doe*
-> *John* or *Main St, New York* -> *New York*) and such a substitution
is then no longer possible. Therefore, it is worth adjusting the
matching for your needs.
- **Q&A with anonymization** - when I'm done writing all the
functionality, I think it would be a cool resource in the documentation
to write a notebook about retrieval from documents using anonymization:
an iterative process, adding new recognizers to fit the data, lessons
learned, and what to look out for.
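The second limitation can be seen directly (mapping values are made up) -- exact matching cannot substitute a value the model has shortened:

```python
deanonymizer_mapping = {"PERSON": {"John Doe": "Slim Shady"}}

# the model shortened "John Doe" to just "John" in its answer
response = "John said he will call back tomorrow."
for anonymized, original in deanonymizer_mapping["PERSON"].items():
    response = response.replace(anonymized, original)

print(response)  # unchanged -- no exact match for "John Doe"
```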
### Twitter handle
@deepsense_ai / @MaksOpp
---------
Co-authored-by: MaksOpp <maks.operlejn@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-09-07 04:33:24 +00:00

from langchain_experimental.data_anonymizer.deanonymizer_mapping import MappingDataType


def exact_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
    """
    Exact matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
    """
    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            text = text.replace(anonymized, original)
    return text


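A minimal usage sketch (mapping values are made up), re-running the same replacement loop inline so it is self-contained:

```python
deanonymizer_mapping = {
    "PERSON": {"John Doe": "Slim Shady"},
    "PHONE_NUMBER": {"111-111-1111": "555-555-5555"},
}

text = "John Doe's number is 111-111-1111."
# same loop as exact_matching_strategy above
for entity_type in deanonymizer_mapping:
    for anonymized, original in deanonymizer_mapping[entity_type].items():
        text = text.replace(anonymized, original)

print(text)  # Slim Shady's number is 555-555-5555.
```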
def case_insensitive_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType
) -> str:
    """
    Case insensitive matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones
    irrespective of their letter case.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones

    Examples of matching:
        keanu reeves -> Keanu Reeves
        JOHN F. KENNEDY -> John F. Kennedy
    """
    # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            # Use regular expressions for case-insensitive matching and replacing;
            # escape the anonymized value so regex metacharacters (e.g. ".")
            # are matched literally
            text = re.sub(re.escape(anonymized), original, text, flags=re.IGNORECASE)
    return text


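For instance, running the case-insensitive replacement inline (mapping values are made up):

```python
import re

deanonymizer_mapping = {"PERSON": {"Maria Lynch": "Slim Shady"}}

text = "maria lynch will join the call."
for entity_type in deanonymizer_mapping:
    for anonymized, original in deanonymizer_mapping[entity_type].items():
        # escaping keeps the anonymized value literal inside the pattern
        text = re.sub(re.escape(anonymized), original, text, flags=re.IGNORECASE)

print(text)  # Slim Shady will join the call.
```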
def fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """
    Fuzzy matching strategy for deanonymization.
    It uses fuzzy matching to find the position of the anonymized entity in the text.
    It replaces all the anonymized entities with the original ones.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    try:
        from fuzzysearch import find_near_matches
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzysearch, please install with "
            "`pip install fuzzysearch`."
        ) from e

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            matches = find_near_matches(anonymized, text, max_l_dist=max_l_dist)

            new_text = ""
            last_end = 0
            for m in matches:
                # add the text that isn't part of a match
                new_text += text[last_end : m.start]
                # add the replacement text
                new_text += original
                last_end = m.end
            # add the remaining text that wasn't part of a match
            new_text += text[last_end:]
            text = new_text

    return text


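To get a feel for `max_l_dist`, here is a minimal edit-distance check using a plain dynamic-programming routine (no fuzzysearch needed); the misspelling from the docstring example is exactly 3 edits away, so the default `max_l_dist=3` still catches it:

```python
def levenshtein(a: str, b: str) -> int:
    """Classic dynamic-programming edit distance (insert/delete/substitute)."""
    prev = list(range(len(b) + 1))
    for i, ca in enumerate(a, 1):
        curr = [i]
        for j, cb in enumerate(b, 1):
            curr.append(min(prev[j] + 1, curr[j - 1] + 1, prev[j - 1] + (ca != cb)))
        prev = curr
    return prev[-1]


print(levenshtein("Kaenu Reves", "Keanu Reeves"))  # 3 -> within max_l_dist=3
```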
def combined_exact_fuzzy_matching_strategy(
    text: str, deanonymizer_mapping: MappingDataType, max_l_dist: int = 3
) -> str:
    """
    RECOMMENDED STRATEGY.
    Combined exact and fuzzy matching strategy for deanonymization.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        max_l_dist: maximum Levenshtein distance between the anonymized entity and the
            text segment to consider it a match

    Examples of matching:
        Kaenu Reves -> Keanu Reeves
        John F. Kennedy -> John Kennedy
    """
    text = exact_matching_strategy(text, deanonymizer_mapping)
    text = fuzzy_matching_strategy(text, deanonymizer_mapping, max_l_dist)
    return text


def ngram_fuzzy_matching_strategy(
    text: str,
    deanonymizer_mapping: MappingDataType,
    fuzzy_threshold: int = 85,
    use_variable_length: bool = True,
) -> str:
    """
    N-gram fuzzy matching strategy for deanonymization.
    It replaces all the anonymized entities with the original ones.
    It generates n-grams of the same length as the anonymized entity from the text
    and uses fuzzy matching to find the position of the anonymized entity.

    Args:
        text: text to deanonymize
        deanonymizer_mapping: mapping between anonymized entities and original ones
        fuzzy_threshold: fuzzy matching threshold
        use_variable_length: whether to use (n-1, n, n+1)-grams or just n-grams
    """

    def generate_ngrams(words_list: List[str], n: int) -> List[str]:
        """Generate n-grams from a list of words"""
        return [
            " ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))
        ]

    try:
        from fuzzywuzzy import fuzz
    except ImportError as e:
        raise ImportError(
            "Could not import fuzzywuzzy, please install with "
            "`pip install fuzzywuzzy`."
        ) from e

    text_words = text.split()
    replacements = []
    matched_indices: List[int] = []

    for entity_type in deanonymizer_mapping:
        for anonymized, original in deanonymizer_mapping[entity_type].items():
            anonymized_words = anonymized.split()

            if use_variable_length:
                gram_lengths = [
                    len(anonymized_words) - 1,
                    len(anonymized_words),
                    len(anonymized_words) + 1,
                ]
            else:
                gram_lengths = [len(anonymized_words)]

            for n in gram_lengths:
                if n > 0:  # Take only positive values
                    segments = generate_ngrams(text_words, n)
                    for i, segment in enumerate(segments):
                        if (
                            fuzz.ratio(anonymized.lower(), segment.lower())
                            > fuzzy_threshold
                            and i not in matched_indices
                        ):
                            replacements.append((i, n, original))
                            # Add the matched segment indices to the list
                            matched_indices.extend(range(i, i + n))

    # Sort replacements by index in reverse order
    replacements.sort(key=lambda x: x[0], reverse=True)

    # Apply replacements in reverse order to not affect subsequent indices
    for start, length, replacement in replacements:
        text_words[start : start + length] = replacement.split()

    return " ".join(text_words)
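A quick sketch of the n-gram idea using only the standard library -- `difflib.SequenceMatcher` stands in for `fuzz.ratio` here, so the scores differ slightly from fuzzywuzzy's, but the best-matching segment is the same:

```python
from difflib import SequenceMatcher
from typing import List


def ratio(a: str, b: str) -> int:
    """Similarity on a 0-100 scale (stdlib stand-in for fuzz.ratio)."""
    return round(SequenceMatcher(None, a, b).ratio() * 100)


def generate_ngrams(words_list: List[str], n: int) -> List[str]:
    return [" ".join(words_list[i : i + n]) for i in range(len(words_list) - (n - 1))]


text_words = "please call Kaenu Reves tomorrow".split()
anonymized = "Keanu Reeves"

# Score every n-gram of the same length as the entity and pick the best match
segments = generate_ngrams(text_words, len(anonymized.split()))
best = max(segments, key=lambda seg: ratio(anonymized.lower(), seg.lower()))
print(best)  # Kaenu Reves
```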