experimental[patch]: missing resolution strategy in anonymization (#16653)

- **Description:** Presidio-based anonymizers were not working because
`_remove_conflicts_and_get_text_manipulation_data` was being called
without a conflict resolution strategy. This PR fixes the issue by adding
a `conflict_resolution` parameter and passing it to that call. In
addition, it removes some mutable default arguments (an antipattern).
 
To reproduce the issue, just run the very first cell of this
[notebook](https://python.langchain.com/docs/guides/privacy/2/) from
langchain's documentation.

<!-- Thank you for contributing to LangChain!

Please title your PR "<package>: <description>", where <package> is
whichever of langchain, community, core, experimental, etc. is being
modified.

Replace this entire comment with:
  - **Description:** a description of the change, 
  - **Issue:** the issue # it fixes if applicable,
  - **Dependencies:** any dependencies required for this change,
- **Twitter handle:** we announce bigger features on Twitter. If your PR
gets announced, and you'd like a mention, we'll gladly shout you out!

Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
pull/16440/head
Massimiliano Pronesti 5 months ago committed by GitHub
parent 8e44363ec9
commit 1bc8d9a943
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -27,7 +27,7 @@ if TYPE_CHECKING:
from presidio_analyzer import AnalyzerEngine, EntityRecognizer
from presidio_analyzer.nlp_engine import NlpEngineProvider
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
from presidio_anonymizer.entities import ConflictResolutionStrategy, OperatorConfig
def _import_analyzer_engine() -> "AnalyzerEngine":
@ -102,7 +102,7 @@ class PresidioAnonymizerBase(AnonymizerBase):
self,
analyzed_fields: Optional[List[str]] = None,
operators: Optional[Dict[str, OperatorConfig]] = None,
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
languages_config: Optional[Dict] = None,
add_default_faker_operators: bool = True,
faker_seed: Optional[int] = None,
):
@ -123,6 +123,8 @@ class PresidioAnonymizerBase(AnonymizerBase):
Defaults to None, in which case faker will be seeded randomly
and provide random values.
"""
if languages_config is None:
languages_config = DEFAULT_LANGUAGES_CONFIG
OperatorConfig = _import_operator_config()
AnalyzerEngine = _import_analyzer_engine()
NlpEngineProvider = _import_nlp_engine_provider()
@ -183,6 +185,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
text: str,
language: Optional[str] = None,
allow_list: Optional[List[str]] = None,
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
) -> str:
"""Anonymize text.
Each PII entity is replaced with a fake value.
@ -204,8 +207,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
"""
if language is None:
language = self.supported_languages[0]
if language not in self.supported_languages:
elif language not in self.supported_languages:
raise ValueError(
f"Language '{language}' is not supported. "
f"Supported languages are: {self.supported_languages}. "
@ -237,7 +239,7 @@ class PresidioAnonymizer(PresidioAnonymizerBase):
filtered_analyzer_results = (
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
analyzer_results
analyzer_results, conflict_resolution
)
)
@ -260,10 +262,12 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
self,
analyzed_fields: Optional[List[str]] = None,
operators: Optional[Dict[str, OperatorConfig]] = None,
languages_config: Dict = DEFAULT_LANGUAGES_CONFIG,
languages_config: Optional[Dict] = None,
add_default_faker_operators: bool = True,
faker_seed: Optional[int] = None,
):
if languages_config is None:
languages_config = DEFAULT_LANGUAGES_CONFIG
super().__init__(
analyzed_fields,
operators,
@ -292,6 +296,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
text: str,
language: Optional[str] = None,
allow_list: Optional[List[str]] = None,
conflict_resolution: Optional[ConflictResolutionStrategy] = None,
) -> str:
"""Anonymize text.
Each PII entity is replaced with a fake value.
@ -348,7 +353,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB
filtered_analyzer_results = (
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
analyzer_results
analyzer_results, conflict_resolution
)
)

File diff suppressed because it is too large Load Diff

@ -12,8 +12,8 @@ repository = "https://github.com/langchain-ai/langchain"
python = ">=3.8.1,<4.0"
langchain-core = "^0.1.7"
langchain = "^0.1"
presidio-anonymizer = {version = "^2.2.33", optional = true}
presidio-analyzer = {version = "^2.2.33", optional = true}
presidio-anonymizer = {version = "^2.2.352", optional = true}
presidio-analyzer = {version = "^2.2.352", optional = true}
faker = {version = "^19.3.1", optional = true}
vowpal-wabbit-next = {version = "0.6.0", optional = true}
sentence-transformers = {version = "^2", optional = true}

Loading…
Cancel
Save