langchain/libs/experimental/tests/unit_tests/test_data_anonymizer.py

from typing import Iterator, List, Optional

import pytest

from . import is_libcublas_available


@pytest.fixture(scope="module", autouse=True)
def check_spacy_model() -> Iterator[None]:
    """Skip when the spaCy package ``en_core_web_lg`` is not installed.

    Registered as a module-scoped, autouse fixture, so no test has to
    request it explicitly.
    """
    import spacy

    model_installed = spacy.util.is_package("en_core_web_lg")
    if not model_installed:
        pytest.skip(reason="Spacy model 'en_core_web_lg' not installed")
    yield


@pytest.fixture(scope="module", autouse=True)
def check_libcublas() -> Iterator[None]:
    """Skip when ``is_libcublas_available()`` reports the library missing.

    Registered as a module-scoped, autouse fixture, so no test has to
    request it explicitly.
    """
    libcublas_present = is_libcublas_available()
    if not libcublas_present:
        pytest.skip(reason="libcublas.so is not available")
    yield


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@pytest.mark.parametrize(
    "analyzed_fields,should_contain",
    [(["PERSON"], False), (["PHONE_NUMBER"], True), (None, False)],
)
def test_anonymize(analyzed_fields: Optional[List[str]], should_contain: bool) -> None:
    """Test anonymizing a name in a simple sentence.

    Args:
        analyzed_fields: Value forwarded to ``PresidioAnonymizer`` as
            ``analyzed_fields``. The parametrization also passes ``None``
            (presumably "use the anonymizer's defaults" - confirm against
            ``PresidioAnonymizer``), hence the ``Optional`` annotation.
        should_contain: Whether the literal "John Doe" is expected to still be
            present in the anonymized output.
    """
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    text = "Hello, my name is John Doe."
    anonymizer = PresidioAnonymizer(analyzed_fields=analyzed_fields)
    anonymized_text = anonymizer.anonymize(text)

    # The name must survive only when PERSON is not among the analyzed fields.
    name_still_present = "John Doe" in anonymized_text
    assert name_still_present == should_contain


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_multiple() -> None:
    """Check that a name, a phone number and an email are all removed."""
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    raw = "John Smith's phone number is 313-666-7440 and email is johnsmith@gmail.com"
    result = PresidioAnonymizer().anonymize(raw)

    # Collect every original value that is still visible in the output.
    leaked = [
        pii
        for pii in ("John Smith", "313-666-7440", "johnsmith@gmail.com")
        if pii in result
    ]
    assert not leaked


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_check_instances() -> None:
    """Test that repeated occurrences of a name get one shared replacement.

    Within one ``anonymize`` call all three "John Smith" occurrences are
    expected to become the same fake name; a second call on the same
    anonymizer is expected to produce a different name.
    """
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    # NOTE: there is no space between "bakery." and the third "John Smith".
    text = "This is John Smith. John Smith works in a bakery.John Smith is a good guy"
    # Name the assertions expect for the first call with faker_seed=42.
    seeded_name = "Connie Lawrence"
    anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42)

    first_pass = anonymizer.anonymize(text)
    assert first_pass.count(seeded_name) == 3

    # New name should be generated
    second_pass = anonymizer.anonymize(text)
    assert seeded_name not in second_pass


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
    """Check that a user-supplied PERSON operator decides the replacement text."""
    from presidio_anonymizer.entities import OperatorConfig

    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    # Replace every detected PERSON with the fixed token "NAME".
    replace_with_name = OperatorConfig("replace", {"new_value": "NAME"})
    anonymizer = PresidioAnonymizer(operators={"PERSON": replace_with_name})

    assert anonymizer.anonymize("Jane Doe was here.") == "NAME was here."


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_add_recognizer_operator() -> None:
    """
    Test registering a custom TITLE recognizer, first on its own and then
    combined with a custom operator for that entity
    """
    from presidio_analyzer import PatternRecognizer
    from presidio_anonymizer.entities import OperatorConfig

    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    # Start with no analyzed fields so only the custom recognizer is in play.
    anonymizer = PresidioAnonymizer(analyzed_fields=[])
    title_recognizer = PatternRecognizer(
        supported_entity="TITLE",
        deny_list=["Sir", "Madam", "Professor"],
    )
    anonymizer.add_recognizer(title_recognizer)

    sentence = "Madam Jane Doe was here."

    # Recognizer only: the title is expected to become the <TITLE> placeholder.
    assert anonymizer.anonymize(sentence) == "<TITLE> Jane Doe was here."

    # Recognizer plus operator: the title is expected to become "Dear".
    dear_operator = OperatorConfig("replace", {"new_value": "Dear"})
    anonymizer.add_operators({"TITLE": dear_operator})
    assert anonymizer.anonymize(sentence) == "Dear Jane Doe was here."


@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_non_faker_values() -> None:
    """Check placeholder output when the default faker operators are disabled.

    Each distinct name is expected to map to its own ``<PERSON...>`` tag, and
    the same tag is expected wherever that name appears again.
    """
    from langchain_experimental.data_anonymizer import PresidioAnonymizer

    # NOTE: the adjacent literals are joined without a space, in both strings.
    source = (
        "My name is John Smith. Your name is Adam Smith. Her name is Jane Smith."
        "Our names are: John Smith, Adam Smith, Jane Smith."
    )
    expected = (
        "My name is <PERSON>. Your name is <PERSON_2>. Her name is <PERSON_3>."
        "Our names are: <PERSON>, <PERSON_2>, <PERSON_3>."
    )

    anonymizer = PresidioAnonymizer(add_default_faker_operators=False)
    assert anonymizer.anonymize(source) == expected
Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00			`from typing import Iterator, List`

			`import pytest`
fix linting 2023-09-11 17:36:01 +00:00
check if libcublas is available before running extended tests 2023-09-11 17:26:41 +00:00			`from . import is_libcublas_available`
Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00

			`@pytest.fixture(scope="module", autouse=True)`
			`def check_spacy_model() -> Iterator[None]:`
			`import spacy`

			`if not spacy.util.is_package("en_core_web_lg"):`
			`pytest.skip(reason="Spacy model 'en_core_web_lg' not installed")`
			`yield`

black formatting 2023-09-11 17:33:43 +00:00
check if libcublas is available before running extended tests 2023-09-11 17:26:41 +00:00			`@pytest.fixture(scope="module", autouse=True)`
			`def check_libcublas() -> Iterator[None]:`
			`if not is_libcublas_available():`
			`pytest.skip(reason="libcublas.so is not available")`
			`yield`


Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00			`@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")`
			`@pytest.mark.parametrize(`
			`"analyzed_fields,should_contain",`
			`[(["PERSON"], False), (["PHONE_NUMBER"], True), (None, False)],`
			`)`
			`def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:`
			`"""Test anonymizing a name in a simple sentence"""`
			`from langchain_experimental.data_anonymizer import PresidioAnonymizer`

			`text = "Hello, my name is John Doe."`
			`anonymizer = PresidioAnonymizer(analyzed_fields=analyzed_fields)`
			`anonymized_text = anonymizer.anonymize(text)`
			`assert ("John Doe" in anonymized_text) == should_contain`


			`@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")`
			`def test_anonymize_multiple() -> None:`
			`"""Test anonymizing multiple items in a sentence"""`
			`from langchain_experimental.data_anonymizer import PresidioAnonymizer`

			`text = "John Smith's phone number is 313-666-7440 and email is johnsmith@gmail.com"`
			`anonymizer = PresidioAnonymizer()`
			`anonymized_text = anonymizer.anonymize(text)`
			`for phrase in ["John Smith", "313-666-7440", "johnsmith@gmail.com"]:`
			`assert phrase not in anonymized_text`


Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")`
			`def test_check_instances() -> None:`
			`"""Test anonymizing multiple items in a sentence"""`
			`from langchain_experimental.data_anonymizer import PresidioAnonymizer`

			`text = (`
			`"This is John Smith. John Smith works in a bakery." "John Smith is a good guy"`
			`)`
			`anonymizer = PresidioAnonymizer(["PERSON"], faker_seed=42)`
			`anonymized_text = anonymizer.anonymize(text)`
			`assert anonymized_text.count("Connie Lawrence") == 3`

			`# New name should be generated`
			`anonymized_text = anonymizer.anonymize(text)`
			`assert anonymized_text.count("Connie Lawrence") == 0`


Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00			`@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")`
			`def test_anonymize_with_custom_operator() -> None:`
			`"""Test anonymize a name with a custom operator"""`
			`from presidio_anonymizer.entities import OperatorConfig`

			`from langchain_experimental.data_anonymizer import PresidioAnonymizer`

Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "NAME"})}`
Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00			`anonymizer = PresidioAnonymizer(operators=custom_operator)`

			`text = "Jane Doe was here."`

			`anonymized_text = anonymizer.anonymize(text)`
Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00			`assert anonymized_text == "NAME was here."`
Add data anonymizer (#9863) ### Description The feature for anonymizing data has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. Anonymization consists of two steps: 1. Identification: Identify all data fields that contain personally identifiable information (PII). 2. Replacement: Replace all PIIs with pseudo values or codes that do not reveal any personal information about the individual but can be used for reference. We're not using regular encryption, because the language model won't be able to understand the meaning or context of the encrypted data. We use Microsoft Presidio together with Faker framework for anonymization purposes because of the wide range of functionalities they provide. The full implementation is available in `PresidioAnonymizer`. ### Future works - deanonymization - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able to restore the original data. - instance anonymization - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-08-30 17:39:44 +00:00

			`@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")`
			`def test_add_recognizer_operator() -> None:`
			`"""`
			`Test add recognizer and anonymize a new type of entity and with a custom operator`
			`"""`
			`from presidio_analyzer import PatternRecognizer`
			`from presidio_anonymizer.entities import OperatorConfig`

			`from langchain_experimental.data_anonymizer import PresidioAnonymizer`

			`anonymizer = PresidioAnonymizer(analyzed_fields=[])`
			`titles_list = ["Sir", "Madam", "Professor"]`
			`custom_recognizer = PatternRecognizer(`
			`supported_entity="TITLE", deny_list=titles_list`
			`)`
			`anonymizer.add_recognizer(custom_recognizer)`

			`# anonymizing with custom recognizer`
			`text = "Madam Jane Doe was here."`
			`anonymized_text = anonymizer.anonymize(text)`
			`assert anonymized_text == "<TITLE> Jane Doe was here."`

			`# anonymizing with custom recognizer and operator`
			`custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}`
			`anonymizer.add_operators(custom_operator)`
			`anonymized_text = anonymizer.anonymize(text)`
			`assert anonymized_text == "Dear Jane Doe was here."`
Instance anonymization (#10501) ### Description Add instance anonymization - if `John Doe` will appear twice in the text, it will be treated as the same entity. The difference between `PresidioAnonymizer` and `PresidioReversibleAnonymizer` is that only the second one has a built-in memory, so it will remember anonymization mapping for multiple texts: ``` >>> anonymizer = PresidioAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Brett Russell. Hi Brett Russell!' ``` ``` >>> anonymizer = PresidioReversibleAnonymizer() >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' >>> anonymizer.anonymize("My name is John Doe. Hi John Doe!") 'My name is Noah Rhodes. Hi Noah Rhodes!' ``` ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --------- Co-authored-by: Bagatur <baskaryan@gmail.com> 2023-10-05 18:23:02 +00:00

			`@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")`
			`def test_non_faker_values() -> None:`
			`"""Test anonymizing multiple items in a sentence without faker values"""`
			`from langchain_experimental.data_anonymizer import PresidioAnonymizer`

			`text = (`
			`"My name is John Smith. Your name is Adam Smith. Her name is Jane Smith."`
			`"Our names are: John Smith, Adam Smith, Jane Smith."`
			`)`
			`expected_result = (`
			`"My name is <PERSON>. Your name is <PERSON_2>. Her name is <PERSON_3>."`
			`"Our names are: <PERSON>, <PERSON_2>, <PERSON_3>."`
			`)`
			`anonymizer = PresidioAnonymizer(add_default_faker_operators=False)`
			`anonymized_text = anonymizer.anonymize(text)`
			`assert anonymized_text == expected_result`