mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
4cc4534d81
### Description The feature for pseudonymizing data with ability to retrieve original text (deanonymization) has been implemented. In order to protect private data, such as when querying external APIs (OpenAI), it is worth pseudonymizing sensitive data to maintain full privacy. But then, after the model response, it would be good to have the data in the original form. I implemented the `PresidioReversibleAnonymizer`, which consists of two parts: 1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example: ``` { "PERSON": { "<anonymized>": "<original>", "John Doe": "Slim Shady" }, "PHONE_NUMBER": { "111-111-1111": "555-555-5555" } ... } ``` 2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it. Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM. ### Future works - **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object. - **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs. 
- **Q&A with anonymization** - when I'm done writing all the functionality, I thought it would be a cool resource in documentation to write a notebook about retrieval from documents using anonymization. An iterative process, adding new recognizers to fit the data, lessons learned and what to look out for ### Twitter handle @deepsense_ai / @MaksOpp --------- Co-authored-by: MaksOpp <maks.operlejn@gmail.com> Co-authored-by: Bagatur <baskaryan@gmail.com>
308 lines
10 KiB
Python
308 lines
10 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
|
|
|
|
import yaml
|
|
|
|
from langchain_experimental.data_anonymizer.base import (
|
|
AnonymizerBase,
|
|
ReversibleAnonymizerBase,
|
|
)
|
|
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
|
|
DeanonymizerMapping,
|
|
MappingDataType,
|
|
)
|
|
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
|
default_matching_strategy,
|
|
)
|
|
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
|
|
get_pseudoanonymizer_mapping,
|
|
)
|
|
|
|
try:
|
|
from presidio_analyzer import AnalyzerEngine
|
|
except ImportError as e:
|
|
raise ImportError(
|
|
"Could not import presidio_analyzer, please install with "
|
|
"`pip install presidio-analyzer`. You will also need to download a "
|
|
"spaCy model to use the analyzer, e.g. "
|
|
"`python -m spacy download en_core_web_lg`."
|
|
) from e
|
|
try:
|
|
from presidio_anonymizer import AnonymizerEngine
|
|
from presidio_anonymizer.entities import OperatorConfig
|
|
except ImportError as e:
|
|
raise ImportError(
|
|
"Could not import presidio_anonymizer, please install with "
|
|
"`pip install presidio-anonymizer`."
|
|
) from e
|
|
|
|
if TYPE_CHECKING:
|
|
from presidio_analyzer import EntityRecognizer, RecognizerResult
|
|
from presidio_anonymizer.entities import EngineResult
|
|
|
|
|
|
class PresidioAnonymizerBase(AnonymizerBase):
    """Common base for Presidio-backed anonymizers.

    Wraps a Presidio ``AnalyzerEngine`` (PII detection) and an
    ``AnonymizerEngine`` (PII replacement) and holds the configuration
    shared by the reversible and non-reversible variants.
    """

    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
        faker_seed: Optional[int] = None,
    ):
        """
        Args:
            analyzed_fields: List of fields to detect and then anonymize.
                Defaults to all entities supported by Microsoft Presidio.
            operators: Operators to use for anonymization.
                Operators allow for custom anonymization of detected PII.
                Learn more:
                https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
            faker_seed: Seed used to initialize faker.
                Defaults to None, in which case faker will be seeded randomly
                and provide random values.
        """
        if analyzed_fields is None:
            analyzed_fields = list(get_pseudoanonymizer_mapping().keys())
        self.analyzed_fields = analyzed_fields

        if operators is None:
            # Default: replace each supported entity with a value produced
            # by the matching faker function.
            operators = {
                field: OperatorConfig(
                    operator_name="custom", params={"lambda": faker_function}
                )
                for field, faker_function in get_pseudoanonymizer_mapping(
                    faker_seed
                ).items()
            }
        self.operators = operators

        self._analyzer = AnalyzerEngine()
        self._anonymizer = AnonymizerEngine()

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """Add a recognizer to the analyzer

        Args:
            recognizer: Recognizer to add to the analyzer.
        """
        self._analyzer.registry.add_recognizer(recognizer)
        # Entities supported by the new recognizer must be analyzed too.
        self.analyzed_fields.extend(recognizer.supported_entities)

    def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
        """Add operators to the anonymizer

        Args:
            operators: Operators to add to the anonymizer.
        """
        self.operators.update(operators)
|
|
|
|
|
|
class PresidioAnonymizer(PresidioAnonymizerBase):
    """Anonymizer that replaces PII with fake values, without keeping a
    mapping back to the original text (non-reversible)."""

    def _anonymize(self, text: str) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.

        Args:
            text: text to anonymize
        """
        # First detect PII entities in the input text...
        analyzer_results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language="en",
        )

        # ...then replace each detected entity using the configured operators.
        anonymized_result = self._anonymizer.anonymize(
            text,
            analyzer_results=analyzer_results,
            operators=self.operators,
        )
        return anonymized_result.text
|
|
|
|
|
|
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
    """Reversible anonymizer based on Microsoft Presidio.

    Besides replacing PII with fake values (like ``PresidioAnonymizer``),
    it records a mapping from every fake value back to the original text,
    which makes it possible to later restore the original data
    (deanonymization). The mapping can be saved to and loaded from
    JSON/YAML files.
    """

    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
        faker_seed: Optional[int] = None,
    ):
        """See ``PresidioAnonymizerBase.__init__`` for argument semantics."""
        super().__init__(analyzed_fields, operators, faker_seed)
        self._deanonymizer_mapping = DeanonymizerMapping()

    @property
    def deanonymizer_mapping(self) -> MappingDataType:
        """Return the deanonymizer mapping"""
        return self._deanonymizer_mapping.data

    def _update_deanonymizer_mapping(
        self,
        original_text: str,
        analyzer_results: List[RecognizerResult],
        anonymizer_results: EngineResult,
    ) -> None:
        """Creates or updates the mapping used to de-anonymize text.

        This method exploits the results returned by the
        analysis and anonymization processes.

        It constructs a mapping from each anonymized entity
        back to its original text value.

        Mapping will be stored as "deanonymizer_mapping" property.

        Example of "deanonymizer_mapping":
        {
            "PERSON": {
                "<anonymized>": "<original>",
                "John Doe": "Slim Shady"
            },
            "PHONE_NUMBER": {
                "111-111-1111": "555-555-5555"
            }
            ...
        }

        Args:
            original_text: the text that was anonymized.
            analyzer_results: conflict-free analyzer results for that text.
            anonymizer_results: result of the anonymization pass.
        """

        # We are able to zip and loop through both lists because we expect
        # them to return corresponding entities for each identified piece
        # of analyzable data from our input.

        # We sort them by their 'start' attribute because it allows us to
        # match corresponding entities by their position in the input text.
        analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
        anonymizer_results.items = sorted(
            anonymizer_results.items, key=lambda d: d.start
        )

        new_deanonymizer_mapping: MappingDataType = defaultdict(dict)

        for analyzed_entity, anonymized_entity in zip(
            analyzer_results, anonymizer_results.items
        ):
            # The original value is recovered from the analyzed span of the
            # input text; the anonymized (fake) value becomes the lookup key.
            original_value = original_text[analyzed_entity.start : analyzed_entity.end]
            new_deanonymizer_mapping[anonymized_entity.entity_type][
                anonymized_entity.text
            ] = original_value

        self._deanonymizer_mapping.update(new_deanonymizer_mapping)

    def _anonymize(self, text: str) -> str:
        """Anonymize text.
        Each PII entity is replaced with a fake value.
        Each time fake values will be different, as they are generated randomly.
        At the same time, we will create a mapping from each anonymized entity
        back to its original text value.

        Args:
            text: text to anonymize
        """
        analyzer_results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language="en",
        )

        # NOTE(review): this relies on a private Presidio API to obtain the
        # same conflict-free entity list the anonymizer operates on, so the
        # analyzer and anonymizer results can be matched one-to-one when
        # building the deanonymizer mapping.
        filtered_analyzer_results = (
            self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
                analyzer_results
            )
        )

        anonymizer_results = self._anonymizer.anonymize(
            text,
            analyzer_results=analyzer_results,
            operators=self.operators,
        )

        self._update_deanonymizer_mapping(
            text, filtered_analyzer_results, anonymizer_results
        )

        return anonymizer_results.text

    def _deanonymize(
        self,
        text_to_deanonymize: str,
        deanonymizer_matching_strategy: Callable[
            [str, MappingDataType], str
        ] = default_matching_strategy,
    ) -> str:
        """Deanonymize text.
        Each anonymized entity is replaced with its original value.
        This method exploits the mapping created during the anonymization process.

        Args:
            text_to_deanonymize: text to deanonymize
            deanonymizer_matching_strategy: function to use to match
                anonymized entities with their original values and replace them.

        Raises:
            ValueError: if no text has been anonymized yet (empty mapping).
        """
        # Check the underlying mapping data (a dict), not the
        # DeanonymizerMapping wrapper object: a plain object is always
        # truthy, so testing the wrapper could never detect an empty mapping.
        if not self.deanonymizer_mapping:
            # Single message string — passing two strings to ValueError
            # would render the message as a tuple.
            raise ValueError(
                "Deanonymizer mapping is empty. "
                "Please call anonymize() and anonymize some text first."
            )

        text_to_deanonymize = deanonymizer_matching_strategy(
            text_to_deanonymize, self.deanonymizer_mapping
        )

        return text_to_deanonymize

    def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Save the deanonymizer mapping to a JSON or YAML file.

        Args:
            file_path: Path to file to save the mapping to.

        Raises:
            ValueError: if the file extension is not .json or .yaml.

        Example:
        .. code-block:: python

            anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json")
        """

        save_path = Path(file_path)

        if save_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{save_path} must have an extension of .json or .yaml")

        # Make sure parent directories exist
        save_path.parent.mkdir(parents=True, exist_ok=True)

        if save_path.suffix == ".json":
            with open(save_path, "w") as f:
                json.dump(self.deanonymizer_mapping, f, indent=2)
        else:  # ".yaml" — the only other suffix that passed validation
            with open(save_path, "w") as f:
                yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False)

    def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Load the deanonymizer mapping from a JSON or YAML file.

        Args:
            file_path: Path to file to load the mapping from.

        Raises:
            ValueError: if the file extension is not .json or .yaml.

        Example:
        .. code-block:: python

            anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json")
        """

        load_path = Path(file_path)

        if load_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{load_path} must have an extension of .json or .yaml")

        if load_path.suffix == ".json":
            with open(load_path, "r") as f:
                loaded_mapping = json.load(f)
        else:  # ".yaml" — the only other suffix that passed validation
            with open(load_path, "r") as f:
                # safe_load: the mapping is plain strings and the file may
                # come from an untrusted source; FullLoader's ability to
                # construct arbitrary Python objects is unnecessary here.
                loaded_mapping = yaml.safe_load(f)

        self._deanonymizer_mapping.update(loaded_mapping)
|