langchain/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py

import re
from collections import defaultdict
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List

if TYPE_CHECKING:
    from presidio_analyzer import RecognizerResult
    from presidio_anonymizer.entities import EngineResult

MappingDataType = Dict[str, Dict[str, str]]


def format_duplicated_operator(operator_name: str, count: int) -> str:
    """Return ``operator_name`` re-suffixed with ``_<count>``.

    Every ``<`` and ``>`` character is removed from the name and any
    trailing ``_<digits>`` suffix is dropped before the new count is
    appended. If the incoming name both started with ``<`` and ended with
    ``>``, the result is wrapped in angle brackets again.

    Examples: ``("<PERSON>", 2) -> "<PERSON_2>"``,
    ``("<PERSON_2>", 3) -> "<PERSON_3>"``, ``("John Doe", 2) -> "John Doe_2"``.
    """
    # Decide on the wrapping from the untouched input, before brackets are
    # stripped out below.
    wrapped = operator_name.startswith("<") and operator_name.endswith(">")

    # Strip brackets first so that a suffix sitting just inside the closing
    # bracket (e.g. "<PERSON_2>") ends up at the end of the string.
    base = re.sub(r"_\d+$", "", re.sub(r"[<>]", "", operator_name))
    suffixed = f"{base}_{count}"

    if not wrapped:
        return suffixed
    return f"<{suffixed}>"


@dataclass
class DeanonymizerMapping:
    """Accumulates key/value pairs per entity type across ``update`` calls.

    ``mapping`` is keyed by entity type; each inner dict holds the pairs
    merged in so far for that type.
    """

    # Nested defaultdicts by default, so an entity type that has not been
    # seen yet can be indexed directly.
    mapping: MappingDataType = field(
        default_factory=lambda: defaultdict(lambda: defaultdict(str))
    )

    @property
    def data(self) -> MappingDataType:
        """Return the mapping as plain dicts (outer and inner dicts are copies)."""
        return {k: dict(v) for k, v in self.mapping.items()}

    def update(self, new_mapping: MappingDataType) -> None:
        """Merge ``new_mapping`` into the stored mapping.

        A value is skipped when it is already stored under the same entity
        type, or when it was already added earlier during this call (under
        any entity type).

        When an incoming key is already present for its entity type, the
        value is stored under a numbered variant of that key produced by
        ``format_duplicated_operator`` (for example ``NAME_2``). The number
        starts at the count of stored entries plus one and is bumped until
        the generated key is unused, so an existing entry is never
        overwritten.

        Args:
            new_mapping: Entity type -> {key: value} pairs to merge in.
        """
        seen_values = set()

        for entity_type, values in new_mapping.items():
            try:
                # A defaultdict creates the inner mapping on first access.
                existing = self.mapping[entity_type]
            except KeyError:
                # ``mapping`` was supplied as a plain dict and this entity
                # type is new to it.
                existing = self.mapping[entity_type] = {}

            # Snapshot the stored values once instead of scanning
            # ``existing.values()`` for every incoming item; values added
            # below are tracked through ``seen_values``.
            known_values = set(existing.values())
            count = len(existing) + 1

            for key, value in values.items():
                if value in seen_values or value in known_values:
                    continue

                new_key = key
                if key in existing:
                    new_key = format_duplicated_operator(key, count)
                    # The numbered key can itself be taken already (e.g. the
                    # stored key is "X_2" and count is 2); keep counting up
                    # rather than clobbering that entry.
                    while new_key in existing:
                        count += 1
                        new_key = format_duplicated_operator(key, count)

                existing[new_key] = value
                seen_values.add(value)
                count += 1


def create_anonymizer_mapping(
    original_text: str,
    analyzer_results: List["RecognizerResult"],
    anonymizer_results: "EngineResult",
    is_reversed: bool = False,
) -> MappingDataType:
    """Build the mapping used to anonymize and/or deanonymize text.

    The analyzer results and the anonymizer result items are paired up by
    position (both ordered by their ``start`` attribute) and turned into a
    per-entity-type dictionary.

    If is_reversed is False, each original text value is mapped to its
    anonymized value.

    If is_reversed is True, each anonymized value is mapped back to its
    original text value.

    An original value that was already recorded for its entity type is
    skipped. If the anonymized text is already present in the mapping for
    that entity type, a count is added to differentiate it. For example,
    two different names that were both replaced by ``<PERSON>`` yield
    ``<PERSON>`` and ``<PERSON_2>``.

    Example of mapping (is_reversed=False):
    {
        "PERSON": {
            "<original>": "<anonymized>",
            "John Doe": "Slim Shady"
        },
        "PHONE_NUMBER": {
            "111-111-1111": "555-555-5555"
        }
        ...
    }

    Args:
        original_text: The text that was analyzed; original values are
            sliced out of it using each analyzer result's start/end.
        analyzer_results: Analyzer findings for ``original_text``.
        anonymizer_results: Anonymizer output; its ``items`` are read.
        is_reversed: Selects the direction of the mapping (see above).

    Returns:
        A dict keyed by entity type (taken from the anonymizer items).
    """
    # We are able to zip and loop through both sequences because we expect
    # them to hold corresponding entities for each identified piece of
    # analyzable data from our input. ``zip`` stops at the shorter one.

    # Ordering by 'start' lets us match corresponding entities by their
    # position in the text. Sorted copies are used so that the caller's
    # list and the engine result are left in their original order.
    ordered_analyzed = sorted(analyzer_results, key=lambda d: d.start)
    ordered_anonymized = sorted(anonymizer_results.items, key=lambda d: d.start)

    mapping: MappingDataType = defaultdict(dict)
    # Number of renamed (numbered) anonymized values per entity type.
    count: dict = defaultdict(int)

    for analyzed, anonymized in zip(ordered_analyzed, ordered_anonymized):
        original_value = original_text[analyzed.start : analyzed.end]
        entity_type = anonymized.entity_type

        # Originals live in the values when reversed, in the keys otherwise.
        if is_reversed:
            cond = original_value in mapping[entity_type].values()
        else:
            cond = original_value in mapping[entity_type]

        if cond:
            # This original value already has an anonymized counterpart.
            continue

        if (
            anonymized.text in mapping[entity_type].values()
            or anonymized.text in mapping[entity_type]
        ):
            # The first occurrence keeps the bare text, so numbering of the
            # following ones starts at 2.
            anonymized_value = format_duplicated_operator(
                anonymized.text, count[entity_type] + 2
            )
            count[entity_type] += 1
        else:
            anonymized_value = anonymized.text

        mapping_key, mapping_value = (
            (anonymized_value, original_value)
            if is_reversed
            else (original_value, anonymized_value)
        )

        mapping[entity_type][mapping_key] = mapping_value

    return mapping
Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`import re`
Data deanonymization (#10093) ### Description The feature for pseudonymizing data with ability to retrieve original text (deanonymization) has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. But then, after the model response, it would be good to have the data in the original form. I implemented the `PresidioReversibleAnonymizer`, which consists of two parts: 1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example: ``` { "PERSON": { "<anonymized>": "<original>", "John Doe": "Slim Shady" }, "PHONE_NUMBER": { "111-111-1111": "555-555-5555" } ... } ``` 2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it. Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM. ### Future works - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. - better matching and substitution of fake values for real ones - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. John Doe -> John or Main St, New York -> New York) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs. 
- Q&A with anonymization - when I'm done writing all the functionality, I thought it would be a cool resource in documentation to write a notebook about retrieval from documents using anonymization. An iterative process, adding new recognizers to fit the data, lessons learned and what to look out for ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-07 04:33:24 +00:00			`from collections import defaultdict`
			`from dataclasses import dataclass, field`
core[patch], langchain[patch], experimental[patch]: import CI (#14414) 2023-12-08 19:28:55 +00:00			`from typing import TYPE_CHECKING, Dict, List`
Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00
core[patch], langchain[patch], experimental[patch]: import CI (#14414) 2023-12-08 19:28:55 +00:00			`if TYPE_CHECKING:`
			`from presidio_analyzer import RecognizerResult`
			`from presidio_anonymizer.entities import EngineResult`
Data deanonymization (#10093) ### Description The feature for pseudonymizing data with ability to retrieve original text (deanonymization) has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. But then, after the model response, it would be good to have the data in the original form. I implemented the `PresidioReversibleAnonymizer`, which consists of two parts: 1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example: ``` { "PERSON": { "<anonymized>": "<original>", "John Doe": "Slim Shady" }, "PHONE_NUMBER": { "111-111-1111": "555-555-5555" } ... } ``` 2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it. Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM. ### Future works - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. - better matching and substitution of fake values for real ones - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. John Doe -> John or Main St, New York -> New York) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs. 
- Q&A with anonymization - when I'm done writing all the functionality, I thought it would be a cool resource in documentation to write a notebook about retrieval from documents using anonymization. An iterative process, adding new recognizers to fit the data, lessons learned and what to look out for ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-07 04:33:24 +00:00
			`MappingDataType = Dict[str, Dict[str, str]]`


Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`def format_duplicated_operator(operator_name: str, count: int) -> str:`
			`"""Format the operator name with the count"""`

			`clean_operator_name = re.sub(r"[<>]", "", operator_name)`
			`clean_operator_name = re.sub(r"_\d+$", "", clean_operator_name)`

			`if operator_name.startswith("<") and operator_name.endswith(">"):`
			`return f"<{clean_operator_name}_{count}>"`
			`else:`
			`return f"{clean_operator_name}_{count}"`


Data deanonymization (#10093) ### Description The feature for pseudonymizing data with ability to retrieve original text (deanonymization) has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. But then, after the model response, it would be good to have the data in the original form. I implemented the `PresidioReversibleAnonymizer`, which consists of two parts: 1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example: ``` { "PERSON": { "<anonymized>": "<original>", "John Doe": "Slim Shady" }, "PHONE_NUMBER": { "111-111-1111": "555-555-5555" } ... } ``` 2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it. Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM. ### Future works - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. - better matching and substitution of fake values for real ones - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. John Doe -> John or Main St, New York -> New York) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs. 
- Q&A with anonymization - when I'm done writing all the functionality, I thought it would be a cool resource in documentation to write a notebook about retrieval from documents using anonymization. An iterative process, adding new recognizers to fit the data, lessons learned and what to look out for ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-07 04:33:24 +00:00			`@dataclass`
			`class DeanonymizerMapping:`
			`mapping: MappingDataType = field(`
			`default_factory=lambda: defaultdict(lambda: defaultdict(str))`
			`)`

			`@property`
			`def data(self) -> MappingDataType:`
			`"""Return the deanonymizer mapping"""`
			`return {k: dict(v) for k, v in self.mapping.items()}`

			`def update(self, new_mapping: MappingDataType) -> None:`
Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`"""Update the deanonymizer mapping with new values`
			`Duplicated values will not be added`
			`If there are multiple entities of the same type, the mapping will`
			`include a count to differentiate them. For example, if there are`
			`two names in the input text, the mapping will include NAME_1 and NAME_2.`
			`"""`
			`seen_values = set()`

Data deanonymization (#10093) ### Description The feature for pseudonymizing data with ability to retrieve original text (deanonymization) has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. But then, after the model response, it would be good to have the data in the original form. I implemented the `PresidioReversibleAnonymizer`, which consists of two parts: 1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example: ``` { "PERSON": { "<anonymized>": "<original>", "John Doe": "Slim Shady" }, "PHONE_NUMBER": { "111-111-1111": "555-555-5555" } ... } ``` 2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it. Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM. ### Future works - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. - better matching and substitution of fake values for real ones - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. John Doe -> John or Main St, New York -> New York) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs. 
- Q&A with anonymization - when I'm done writing all the functionality, I thought it would be a cool resource in documentation to write a notebook about retrieval from documents using anonymization. An iterative process, adding new recognizers to fit the data, lessons learned and what to look out for ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-09-07 04:33:24 +00:00			`for entity_type, values in new_mapping.items():`
Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`count = len(self.mapping[entity_type]) + 1`

			`for key, value in values.items():`
			`if (`
			`value not in seen_values`
			`and value not in self.mapping[entity_type].values()`
			`):`
			`new_key = (`
			`format_duplicated_operator(key, count)`
			`if key in self.mapping[entity_type]`
			`else key`
			`)`

			`self.mapping[entity_type][new_key] = value`
			`seen_values.add(value)`
			`count += 1`


			`def create_anonymizer_mapping(`
			`original_text: str,`
core[patch], langchain[patch], experimental[patch]: import CI (#14414) 2023-12-08 19:28:55 +00:00			`analyzer_results: List["RecognizerResult"],`
			`anonymizer_results: "EngineResult",`
Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`is_reversed: bool = False,`
			`) -> MappingDataType:`
			`"""Creates or updates the mapping used to anonymize and/or deanonymize text.`

			`This method exploits the results returned by the`
			`analysis and anonymization processes.`

			`If is_reversed is True, it constructs a mapping from each original`
			`entity to its anonymized value.`

			`If is_reversed is False, it constructs a mapping from each`
			`anonymized entity back to its original text value.`

			`If there are multiple entities of the same type, the mapping will`
			`include a count to differentiate them. For example, if there are`
			`two names in the input text, the mapping will include NAME_1 and NAME_2.`

			`Example of mapping:`
			`{`
			`"PERSON": {`
			`"<original>": "<anonymized>",`
			`"John Doe": "Slim Shady"`
			`},`
			`"PHONE_NUMBER": {`
			`"111-111-1111": "555-555-5555"`
			`}`
			`...`
			`}`
			`"""`
			`# We are able to zip and loop through both lists because we expect`
			`# them to return corresponding entities for each identified piece`
			`# of analyzable data from our input.`

			`# We sort them by their 'start' attribute because it allows us to`
			`# match corresponding entities by their position in the input text.`
			`analyzer_results.sort(key=lambda d: d.start)`
			`anonymizer_results.items.sort(key=lambda d: d.start)`

			`mapping: MappingDataType = defaultdict(dict)`
			`count: dict = defaultdict(int)`

			`for analyzed, anonymized in zip(analyzer_results, anonymizer_results.items):`
			`original_value = original_text[analyzed.start : analyzed.end]`
			`entity_type = anonymized.entity_type`

			`if is_reversed:`
			`cond = original_value in mapping[entity_type].values()`
			`else:`
			`cond = original_value in mapping[entity_type]`

			`if cond:`
			`continue`

			`if (`
			`anonymized.text in mapping[entity_type].values()`
			`or anonymized.text in mapping[entity_type]`
			`):`
			`anonymized_value = format_duplicated_operator(`
			`anonymized.text, count[entity_type] + 2`
			`)`
			`count[entity_type] += 1`
			`else:`
			`anonymized_value = anonymized.text`

			`mapping_key, mapping_value = (`
			`(anonymized_value, original_value)`
			`if is_reversed`
			`else (original_value, anonymized_value)`
			`)`

			`mapping[entity_type][mapping_key] = mapping_value`

			`return mapping`