experimental[patch]: missing resolution strategy in anonymization (#16653)

- **Description:** Presidio-based anonymizers were not working because
`_remove_conflicts_and_get_text_manipulation_data` was called without a
conflict resolution strategy. This PR fixes the issue by adding a
`conflict_resolution` parameter to the anonymize methods and passing it
through to that call. In addition, it replaces the mutable default value of
`languages_config` (an antipattern) with `None`.
 
To reproduce the issue, just run the very first cell of this
[notebook](https://python.langchain.com/docs/guides/privacy/2/) from
langchain's documentation.

<!-- Thank you for contributing to LangChain!

Please title your PR "<package>: <description>", where <package> is
whichever of langchain, community, core, experimental, etc. is being
modified.

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes if applicable,
  - **Dependencies:** any dependencies required for this change,
  - **Twitter handle:** we announce bigger features on Twitter. If your PR
    gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
pull/16440/head
Massimiliano Pronesti 8 months ago committed by GitHub
parent 8e44363ec9
commit 1bc8d9a943
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -27,7 +27,7 @@ if TYPE_CHECKING:
from presidio_analyzer import AnalyzerEngine, EntityRecognizer from presidio_analyzer import AnalyzerEngine, EntityRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig from presidio_anonymizer.entities import ConflictResolutionStrategy, OperatorConfig
def _import_analyzer_engine() -> "AnalyzerEngine": def _import_analyzer_engine() -> "AnalyzerEngine":
@ -102,7 +102,7 @@ class PresidioAnonymizerBase(AnonymizerBase):
self, self,
analyzed_fields: Optional[List[str]] = None, analyzed_fields: Optional[List[str]] = None,
operators: Optional[Dict[str, OperatorConfig]] = None, operators: Optional[Dict[str, OperatorConfig]] = None,
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, languages_config: Optional[Dict] = None,
add_default_faker_operators: bool = True, add_default_faker_operators: bool = True,
faker_seed: Optional[int] = None, faker_seed: Optional[int] = None,
): ):
@ -123,6 +123,8 @@ class PresidioAnonymizerBase(AnonymizerBase):
Defaults to None, in which case faker will be seeded randomly Defaults to None, in which case faker will be seeded randomly
and provide random values. and provide random values.
""" """
if languages_config is None:
languages_config = DEFAULT_LANGUAGES_CONFIG
OperatorConfig = _import_operator_config() OperatorConfig = _import_operator_config()
AnalyzerEngine = _import_analyzer_engine() AnalyzerEngine = _import_analyzer_engine()
NlpEngineProvider = _import_nlp_engine_provider() NlpEngineProvider = _import_nlp_engine_provider()
@ -183,6 +185,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
text: str, text: str,
language: Optional[str] = None, language: Optional[str] = None,
allow_list: Optional[List[str]] = None, allow_list: Optional[List[str]] = None,
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
) -> str: ) -> str:
"""Anonymize text. """Anonymize text.
Each PII entity is replaced with a fake value. Each PII entity is replaced with a fake value.
@ -204,8 +207,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
""" """
if language is None: if language is None:
language = self.supported_languages[0] language = self.supported_languages[0]
elif language not in self.supported_languages:
if language not in self.supported_languages:
raise ValueError( raise ValueError(
f"Language '{language}' is not supported. " f"Language '{language}' is not supported. "
f"Supported languages are: {self.supported_languages}. " f"Supported languages are: {self.supported_languages}. "
@ -237,7 +239,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
filtered_analyzer_results = ( filtered_analyzer_results = (
self._anonymizer._remove_conflicts_and_get_text_manipulation_data( self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
analyzer_results analyzer_results, conflict_resolution
) )
) )
@ -260,10 +262,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
self, self,
analyzed_fields: Optional[List[str]] = None, analyzed_fields: Optional[List[str]] = None,
operators: Optional[Dict[str, OperatorConfig]] = None, operators: Optional[Dict[str, OperatorConfig]] = None,
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, languages_config: Optional[Dict] = None,
add_default_faker_operators: bool = True, add_default_faker_operators: bool = True,
faker_seed: Optional[int] = None, faker_seed: Optional[int] = None,
): ):
if languages_config is None:
languages_config = DEFAULT_LANGUAGES_CONFIG
super().__init__( super().__init__(
analyzed_fields, analyzed_fields,
operators, operators,
@ -292,6 +296,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
text: str, text: str,
language: Optional[str] = None, language: Optional[str] = None,
allow_list: Optional[List[str]] = None, allow_list: Optional[List[str]] = None,
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
) -> str: ) -> str:
"""Anonymize text. """Anonymize text.
Each PII entity is replaced with a fake value. Each PII entity is replaced with a fake value.
@ -348,7 +353,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
filtered_analyzer_results = ( filtered_analyzer_results = (
self._anonymizer._remove_conflicts_and_get_text_manipulation_data( self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
analyzer_results analyzer_results, conflict_resolution
) )
) )

File diff suppressed because it is too large Load Diff

@ -12,8 +12,8 @@ repository = "https://github.com/langchain-ai/langchain"
python = ">=3.8.1,<4.0" python = ">=3.8.1,<4.0"
langchain-core = "^0.1.7" langchain-core = "^0.1.7"
langchain = "^0.1" langchain = "^0.1"
presidio-anonymizer = {version = "^2.2.33", optional = true} presidio-anonymizer = {version = "^2.2.352", optional = true}
presidio-analyzer = {version = "^2.2.33", optional = true} presidio-analyzer = {version = "^2.2.352", optional = true}
faker = {version = "^19.3.1", optional = true} faker = {version = "^19.3.1", optional = true}
vowpal-wabbit-next = {version = "0.6.0", optional = true} vowpal-wabbit-next = {version = "0.6.0", optional = true}
sentence-transformers = {version = "^2", optional = true} sentence-transformers = {version = "^2", optional = true}

Loading…
Cancel
Save