mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
274c3dc3a8
### Description

Add multiple language support to Anonymizer.

PII detection in Microsoft Presidio relies on several components: in addition to the usual pattern matching (e.g. using regex), the analyzer uses a model for Named Entity Recognition (NER) to extract entities such as:

- `PERSON`
- `LOCATION`
- `DATE_TIME`
- `NRP`
- `ORGANIZATION`

[[Source]](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)

To handle NER in specific languages, we utilize unique models from the `spaCy` library, recognized for its extensive selection covering multiple languages and sizes. However, it's not restrictive, allowing for integration of alternative frameworks such as [Stanza](https://microsoft.github.io/presidio/analyzer/nlp_engines/spacy_stanza/) or [transformers](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/) when necessary.

### Future works

- **automatic language detection** - instead of passing the language as a parameter in `anonymizer.anonymize`, we could detect the language/s beforehand and then use the corresponding NER model. We have discussed this internally and @mateusz-wosinski-ds will look into a standalone language detection tool/chain for LangChain 😄

### Twitter handle

@deepsense_ai / @MaksOpp

### Tag maintainer

@baskaryan @hwchase17 @hinthornw
367 lines
13 KiB
Python
367 lines
13 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
from collections import defaultdict
|
|
from pathlib import Path
|
|
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
|
|
|
|
import yaml
|
|
|
|
from langchain_experimental.data_anonymizer.base import (
|
|
AnonymizerBase,
|
|
ReversibleAnonymizerBase,
|
|
)
|
|
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
|
|
DeanonymizerMapping,
|
|
MappingDataType,
|
|
)
|
|
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
|
|
default_matching_strategy,
|
|
)
|
|
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
|
|
get_pseudoanonymizer_mapping,
|
|
)
|
|
|
|
try:
|
|
from presidio_analyzer import AnalyzerEngine
|
|
from presidio_analyzer.nlp_engine import NlpEngineProvider
|
|
|
|
except ImportError as e:
|
|
raise ImportError(
|
|
"Could not import presidio_analyzer, please install with "
|
|
"`pip install presidio-analyzer`. You will also need to download a "
|
|
"spaCy model to use the analyzer, e.g. "
|
|
"`python -m spacy download en_core_web_lg`."
|
|
) from e
|
|
try:
|
|
from presidio_anonymizer import AnonymizerEngine
|
|
from presidio_anonymizer.entities import OperatorConfig
|
|
except ImportError as e:
|
|
raise ImportError(
|
|
"Could not import presidio_anonymizer, please install with "
|
|
"`pip install presidio-anonymizer`."
|
|
) from e
|
|
|
|
if TYPE_CHECKING:
|
|
from presidio_analyzer import EntityRecognizer, RecognizerResult
|
|
from presidio_anonymizer.entities import EngineResult
|
|
|
|
# Default NLP-engine configuration for multi-language anonymization.
# It is the default value of the `languages_config` constructor argument.
# A walkthrough with examples lives in:
# langchain/docs/extras/guides/privacy/multi_language_anonymization.ipynb
DEFAULT_LANGUAGES_CONFIG = {
    # spaCy is configured here; Stanza and transformers can be used instead.
    # See https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
    "nlp_engine_name": "spacy",
    "models": [
        {
            "lang_code": "en",
            "model_name": "en_core_web_lg",
        },
        # Further languages follow the same shape, for example:
        #   {"lang_code": "de", "model_name": "de_core_news_md"},
        #   {"lang_code": "es", "model_name": "es_core_news_md"},
        #   ...
        # List of available models: https://spacy.io/usage/models
    ],
}
|
|
|
|
|
|
class PresidioAnonymizerBase(AnonymizerBase):
    """Shared setup for the Presidio-backed anonymizers.

    Builds the NLP engine, the Presidio analyzer and the Presidio anonymizer,
    and stores the entity list (`analyzed_fields`) and the `operators` that
    subclasses pass to Presidio when anonymizing.
    """

    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
        languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
        faker_seed: Optional[int] = None,
    ) -> None:
        """
        Args:
            analyzed_fields: List of fields to detect and then anonymize.
                Defaults to every entity type present in the mapping returned
                by get_pseudoanonymizer_mapping(). The list is copied, so
                later add_recognizer() calls do not modify the caller's list.
            operators: Operators to use for anonymization.
                Operators allow for custom anonymization of detected PII.
                When omitted, one "custom" operator per entity type is built
                from get_pseudoanonymizer_mapping(faker_seed). The dict is
                copied, so later add_operators() calls do not modify the
                caller's dict.
                Learn more:
                https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
            languages_config: Configuration for the NLP engine.
                First language in the list will be used as the main language
                in self.anonymize(...) when no language is specified.
                Learn more:
                https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/
            faker_seed: Seed forwarded to get_pseudoanonymizer_mapping() when
                the default operators are built; it is unused when
                `operators` is supplied. Defaults to None.
        """
        # Copy caller-supplied containers: add_recognizer() and add_operators()
        # mutate these attributes in place, and that must not leak into
        # objects owned by the caller.
        self.analyzed_fields = (
            list(analyzed_fields)
            if analyzed_fields is not None
            else list(get_pseudoanonymizer_mapping().keys())
        )
        self.operators = (
            dict(operators)
            if operators is not None
            else {
                field: OperatorConfig(
                    operator_name="custom", params={"lambda": faker_function}
                )
                for field, faker_function in get_pseudoanonymizer_mapping(
                    faker_seed
                ).items()
            }
        )

        provider = NlpEngineProvider(nlp_configuration=languages_config)
        nlp_engine = provider.create_engine()

        # The engine's `nlp` mapping is keyed by language code; its key order
        # determines which language subclasses treat as the default (index 0).
        self.supported_languages = list(nlp_engine.nlp.keys())

        self._analyzer = AnalyzerEngine(
            supported_languages=self.supported_languages, nlp_engine=nlp_engine
        )
        self._anonymizer = AnonymizerEngine()

    def add_recognizer(self, recognizer: EntityRecognizer) -> None:
        """Add a recognizer to the analyzer

        The recognizer is registered in the analyzer's registry and its
        supported entities are appended to `analyzed_fields`.

        Args:
            recognizer: Recognizer to add to the analyzer.
        """
        self._analyzer.registry.add_recognizer(recognizer)
        self.analyzed_fields.extend(recognizer.supported_entities)

    def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
        """Add operators to the anonymizer

        Entries whose key already exists replace the current operator.

        Args:
            operators: Operators to add to the anonymizer.
        """
        self.operators.update(operators)
|
|
|
|
|
|
class PresidioAnonymizer(PresidioAnonymizerBase):
    """One-way anonymizer: detects PII with Presidio and replaces it."""

    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
        """Return `text` with every detected entity replaced.

        Entities listed in `self.analyzed_fields` are located by the analyzer
        and rewritten by the anonymizer using `self.operators`.

        Args:
            text: text to anonymize
            language: language code used for PII analysis. When None, the
                first entry of `self.supported_languages` is used.

        Raises:
            ValueError: if the resolved language is not among
                `self.supported_languages`.
        """
        lang = self.supported_languages[0] if language is None else language

        if lang not in self.supported_languages:
            raise ValueError(
                f"Language '{lang}' is not supported. "
                f"Supported languages are: {self.supported_languages}. "
                "Change your language configuration file to add more languages."
            )

        detected = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language=lang,
        )
        outcome = self._anonymizer.anonymize(
            text,
            analyzer_results=detected,
            operators=self.operators,
        )
        return outcome.text
|
|
|
|
|
|
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
    """Presidio-backed anonymizer that records how to undo its replacements.

    Each `_anonymize` call adds the anonymized -> original pairs it produced
    to an internal mapping. `_deanonymize` hands that mapping to a matching
    strategy, and the mapping can be saved to / loaded from JSON or YAML.
    """

    def __init__(
        self,
        analyzed_fields: Optional[List[str]] = None,
        operators: Optional[Dict[str, OperatorConfig]] = None,
        languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
        faker_seed: Optional[int] = None,
    ):
        """Initialize the parent anonymizer and create a fresh mapping.

        All four arguments are forwarded unchanged to the parent initializer.
        """
        super().__init__(analyzed_fields, operators, languages_config, faker_seed)
        self._deanonymizer_mapping = DeanonymizerMapping()

    @property
    def deanonymizer_mapping(self) -> MappingDataType:
        """Return the deanonymizer mapping"""
        return self._deanonymizer_mapping.data

    def _update_deanonymizer_mapping(
        self,
        original_text: str,
        analyzer_results: List[RecognizerResult],
        anonymizer_results: EngineResult,
    ) -> None:
        """Creates or updates the mapping used to de-anonymize text.

        This method exploits the results returned by the
        analysis and anonymization processes.

        It constructs a mapping from each anonymized entity
        back to its original text value.

        Mapping will be stored as "deanonymizer_mapping" property.

        Note that `anonymizer_results.items` is re-sorted in place.

        Example of "deanonymizer_mapping":
        {
            "PERSON": {
                "<anonymized>": "<original>",
                "John Doe": "Slim Shady"
            },
            "PHONE_NUMBER": {
                "111-111-1111": "555-555-5555"
            }
            ...
        }
        """

        # We are able to zip and loop through both lists because we expect
        # them to return corresponding entities for each identified piece
        # of analyzable data from our input.

        # We sort them by their 'start' attribute because it allows us to
        # match corresponding entities by their position in the input text.
        analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
        anonymizer_results.items = sorted(
            anonymizer_results.items, key=lambda d: d.start
        )

        new_deanonymizer_mapping: MappingDataType = defaultdict(dict)

        for analyzed_entity, anonymized_entity in zip(
            analyzer_results, anonymizer_results.items
        ):
            # The analyzer result carries offsets into the original text;
            # the anonymizer result carries the replacement text.
            original_value = original_text[analyzed_entity.start : analyzed_entity.end]
            new_deanonymizer_mapping[anonymized_entity.entity_type][
                anonymized_entity.text
            ] = original_value

        self._deanonymizer_mapping.update(new_deanonymizer_mapping)

    def _anonymize(self, text: str, language: Optional[str] = None) -> str:
        """Anonymize text.
        Each detected PII entity is replaced according to `self.operators`.
        At the same time, we will create a mapping from each anonymized entity
        back to its original text value.

        Args:
            text: text to anonymize
            language: language to use for analysis of PII
                If None, the first (main) language in the list
                of languages specified in the configuration will be used.

        Raises:
            ValueError: if the language is not in `self.supported_languages`.
        """
        if language is None:
            language = self.supported_languages[0]

        if language not in self.supported_languages:
            raise ValueError(
                f"Language '{language}' is not supported. "
                f"Supported languages are: {self.supported_languages}. "
                "Change your language configuration file to add more languages."
            )

        analyzer_results = self._analyzer.analyze(
            text,
            entities=self.analyzed_fields,
            language=language,
        )

        # Conflict-filtered results are what the mapping update pairs with
        # the anonymizer output below. This relies on a private method of
        # the anonymizer engine.
        filtered_analyzer_results = (
            self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
                analyzer_results
            )
        )

        anonymizer_results = self._anonymizer.anonymize(
            text,
            analyzer_results=analyzer_results,
            operators=self.operators,
        )

        self._update_deanonymizer_mapping(
            text, filtered_analyzer_results, anonymizer_results
        )

        return anonymizer_results.text

    def _deanonymize(
        self,
        text_to_deanonymize: str,
        deanonymizer_matching_strategy: Callable[
            [str, MappingDataType], str
        ] = default_matching_strategy,
    ) -> str:
        """Deanonymize text.
        Each anonymized entity is replaced with its original value.
        This method exploits the mapping created during the anonymization process.

        Args:
            text_to_deanonymize: text to deanonymize
            deanonymizer_matching_strategy: function to use to match
                anonymized entities with their original values and replace them.

        Raises:
            ValueError: if the deanonymizer mapping object is falsy.
        """
        # NOTE(review): this tests the truthiness of the DeanonymizerMapping
        # object itself, not of its `.data`. Unless that class defines
        # `__bool__`/`__len__` (not visible here) the guard never fires for
        # an empty mapping - confirm, and consider checking
        # `self.deanonymizer_mapping` instead.
        if not self._deanonymizer_mapping:
            # One message string: passing the two sentences as separate
            # arguments made the error render as a tuple.
            raise ValueError(
                "Deanonymizer mapping is empty. "
                "Please call anonymize() and anonymize some text first."
            )

        text_to_deanonymize = deanonymizer_matching_strategy(
            text_to_deanonymize, self.deanonymizer_mapping
        )

        return text_to_deanonymize

    def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Save the deanonymizer mapping to a JSON or YAML file.

        Missing parent directories are created.

        Args:
            file_path: Path to file to save the mapping to.

        Raises:
            ValueError: if the file extension is neither .json nor .yaml.

        Example:
        .. code-block:: python

            anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json")
        """

        save_path = Path(file_path)

        if save_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{save_path} must have an extension of .json or .yaml")

        # Make sure parent directories exist
        save_path.parent.mkdir(parents=True, exist_ok=True)

        if save_path.suffix == ".json":
            with open(save_path, "w") as f:
                json.dump(self.deanonymizer_mapping, f, indent=2)
        elif save_path.suffix == ".yaml":
            with open(save_path, "w") as f:
                yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False)

    def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
        """Load the deanonymizer mapping from a JSON or YAML file.

        The loaded entries are merged into the current mapping through its
        `update` method.

        Args:
            file_path: Path to file to load the mapping from.

        Raises:
            ValueError: if the file extension is neither .json nor .yaml.

        Example:
        .. code-block:: python

            anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json")
        """

        load_path = Path(file_path)

        if load_path.suffix not in [".json", ".yaml"]:
            raise ValueError(f"{load_path} must have an extension of .json or .yaml")

        if load_path.suffix == ".json":
            with open(load_path, "r") as f:
                loaded_mapping = json.load(f)
        elif load_path.suffix == ".yaml":
            with open(load_path, "r") as f:
                # SECURITY(review): FullLoader accepts more YAML tags than
                # SafeLoader. If mapping files can come from untrusted
                # sources, prefer `yaml.safe_load(f)`.
                loaded_mapping = yaml.load(f, Loader=yaml.FullLoader)

        self._deanonymizer_mapping.update(loaded_mapping)
|