2023-07-21 17:36:28 +00:00
|
|
|
[tool.poetry]
|
2023-07-21 20:32:39 +00:00
|
|
|
name = "langchain-experimental"
|
2023-11-07 19:49:23 +00:00
|
|
|
version = "0.0.38"
|
2023-07-21 17:36:28 +00:00
|
|
|
description = "Building applications with LLMs through composability"
|
|
|
|
authors = []
|
|
|
|
license = "MIT"
|
|
|
|
readme = "README.md"
|
2023-08-18 16:55:43 +00:00
|
|
|
repository = "https://github.com/langchain-ai/langchain"
|
2023-07-21 17:36:28 +00:00
|
|
|
|
|
|
|
|
|
|
|
[tool.poetry.dependencies]
|
|
|
|
python = ">=3.8.1,<4.0"
|
2023-10-05 02:36:58 +00:00
|
|
|
langchain = ">=0.0.308"
|
Add data anonymizer (#9863)
### Description
The feature for anonymizing data has been implemented. In order to
protect private data, such as when querying external APIs (OpenAI), it
is worth pseudonymizing sensitive data to maintain full privacy.
Anonynization consists of two steps:
1. **Identification:** Identify all data fields that contain personally
identifiable information (PII).
2. **Replacement**: Replace all PIIs with pseudo values or codes that do
not reveal any personal information about the individual but can be used
for reference. We're not using regular encryption, because the language
model won't be able to understand the meaning or context of the
encrypted data.
We use *Microsoft Presidio* together with *Faker* framework for
anonymization purposes because of the wide range of functionalities they
provide. The full implementation is available in `PresidioAnonymizer`.
### Future works
- **deanonymization** - add the ability to reverse anonymization. For
example, the workflow could look like this: `anonymize -> LLMChain ->
deanonymize`. By doing this, we will retain anonymity in requests to,
for example, OpenAI, and then be able restore the original data.
- **instance anonymization** - at this point, each occurrence of PII is
treated as a separate entity and separately anonymized. Therefore, two
occurrences of the name John Doe in the text will be changed to two
different names. It is therefore worth introducing support for full
instance detection, so that repeated occurrences are treated as a single
object.
### Twitter handle
@deepsense_ai / @MaksOpp
---------
Co-authored-by: MaksOpp <maks.operlejn@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-08-30 17:39:44 +00:00
|
|
|
presidio-anonymizer = {version = "^2.2.33", optional = true}
|
|
|
|
presidio-analyzer = {version = "^2.2.33", optional = true}
|
|
|
|
faker = {version = "^19.3.1", optional = true}
|
2023-09-11 16:16:08 +00:00
|
|
|
vowpal-wabbit-next = {version = "0.6.0", optional = true}
|
|
|
|
sentence-transformers = {version = "^2", optional = true}
|
2023-07-21 17:36:28 +00:00
|
|
|
|
|
|
|
[tool.poetry.group.lint.dependencies]
|
2023-10-31 14:53:12 +00:00
|
|
|
ruff = "^0.1.3"
|
2023-07-21 17:36:28 +00:00
|
|
|
|
|
|
|
[tool.poetry.group.typing.dependencies]
|
|
|
|
mypy = "^0.991"
|
2023-07-21 20:32:39 +00:00
|
|
|
types-pyyaml = "^6.0.12.2"
|
2023-09-06 20:32:59 +00:00
|
|
|
types-requests = "^2.28.11.5"
|
2023-07-21 17:36:28 +00:00
|
|
|
|
|
|
|
[tool.poetry.group.dev.dependencies]
|
|
|
|
jupyter = "^1.0.0"
|
|
|
|
setuptools = "^67.6.1"
|
|
|
|
|
|
|
|
[tool.poetry.group.test.dependencies]
|
|
|
|
# The only dependencies that should be added are
|
|
|
|
# dependencies used for running tests (e.g., pytest, freezegun, response).
|
|
|
|
# Any dependencies that do not meet that criteria will be removed.
|
|
|
|
pytest = "^7.3.0"
|
|
|
|
|
2023-10-21 14:52:18 +00:00
|
|
|
|
|
|
|
[tool.poetry.group.test_integration]
|
|
|
|
optional = true
|
|
|
|
dependencies = {}
|
|
|
|
|
Add data anonymizer (#9863)
### Description
The feature for anonymizing data has been implemented. In order to
protect private data, such as when querying external APIs (OpenAI), it
is worth pseudonymizing sensitive data to maintain full privacy.
Anonynization consists of two steps:
1. **Identification:** Identify all data fields that contain personally
identifiable information (PII).
2. **Replacement**: Replace all PIIs with pseudo values or codes that do
not reveal any personal information about the individual but can be used
for reference. We're not using regular encryption, because the language
model won't be able to understand the meaning or context of the
encrypted data.
We use *Microsoft Presidio* together with *Faker* framework for
anonymization purposes because of the wide range of functionalities they
provide. The full implementation is available in `PresidioAnonymizer`.
### Future works
- **deanonymization** - add the ability to reverse anonymization. For
example, the workflow could look like this: `anonymize -> LLMChain ->
deanonymize`. By doing this, we will retain anonymity in requests to,
for example, OpenAI, and then be able restore the original data.
- **instance anonymization** - at this point, each occurrence of PII is
treated as a separate entity and separately anonymized. Therefore, two
occurrences of the name John Doe in the text will be changed to two
different names. It is therefore worth introducing support for full
instance detection, so that repeated occurrences are treated as a single
object.
### Twitter handle
@deepsense_ai / @MaksOpp
---------
Co-authored-by: MaksOpp <maks.operlejn@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-08-30 17:39:44 +00:00
|
|
|
# An extra used to be able to add extended testing.
|
|
|
|
# Please use new-line on formatting to make it easier to add new packages without
|
|
|
|
# merge-conflicts
|
|
|
|
[tool.poetry.extras]
|
|
|
|
extended_testing = [
|
|
|
|
"presidio-anonymizer",
|
|
|
|
"presidio-analyzer",
|
|
|
|
"faker",
|
2023-09-11 16:16:08 +00:00
|
|
|
"vowpal-wabbit-next",
|
|
|
|
"sentence-transformers",
|
Add data anonymizer (#9863)
### Description
The feature for anonymizing data has been implemented. In order to
protect private data, such as when querying external APIs (OpenAI), it
is worth pseudonymizing sensitive data to maintain full privacy.
Anonynization consists of two steps:
1. **Identification:** Identify all data fields that contain personally
identifiable information (PII).
2. **Replacement**: Replace all PIIs with pseudo values or codes that do
not reveal any personal information about the individual but can be used
for reference. We're not using regular encryption, because the language
model won't be able to understand the meaning or context of the
encrypted data.
We use *Microsoft Presidio* together with *Faker* framework for
anonymization purposes because of the wide range of functionalities they
provide. The full implementation is available in `PresidioAnonymizer`.
### Future works
- **deanonymization** - add the ability to reverse anonymization. For
example, the workflow could look like this: `anonymize -> LLMChain ->
deanonymize`. By doing this, we will retain anonymity in requests to,
for example, OpenAI, and then be able restore the original data.
- **instance anonymization** - at this point, each occurrence of PII is
treated as a separate entity and separately anonymized. Therefore, two
occurrences of the name John Doe in the text will be changed to two
different names. It is therefore worth introducing support for full
instance detection, so that repeated occurrences are treated as a single
object.
### Twitter handle
@deepsense_ai / @MaksOpp
---------
Co-authored-by: MaksOpp <maks.operlejn@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-08-30 17:39:44 +00:00
|
|
|
]
|
|
|
|
|
2023-07-21 17:36:28 +00:00
|
|
|
[tool.ruff]
|
|
|
|
select = [
|
|
|
|
"E", # pycodestyle
|
|
|
|
"F", # pyflakes
|
|
|
|
"I", # isort
|
|
|
|
]
|
|
|
|
|
|
|
|
[tool.mypy]
|
|
|
|
ignore_missing_imports = "True"
|
|
|
|
disallow_untyped_defs = "True"
|
|
|
|
exclude = ["notebooks", "examples", "example_data"]
|
|
|
|
|
|
|
|
[tool.coverage.run]
|
|
|
|
omit = [
|
|
|
|
"tests/*",
|
|
|
|
]
|
|
|
|
|
|
|
|
[build-system]
|
|
|
|
requires = ["poetry-core>=1.0.0"]
|
|
|
|
build-backend = "poetry.core.masonry.api"
|
|
|
|
|
|
|
|
[tool.pytest.ini_options]
|
|
|
|
# --strict-markers will raise errors on unknown marks.
|
|
|
|
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
|
|
|
|
#
|
|
|
|
# https://docs.pytest.org/en/7.1.x/reference/reference.html
|
|
|
|
# --strict-config any warnings encountered while parsing the `pytest`
|
|
|
|
# section of the configuration file raise errors.
|
|
|
|
#
|
|
|
|
# https://github.com/tophat/syrupy
|
|
|
|
# --snapshot-warn-unused Prints a warning on unused snapshots rather than fail the test suite.
|
|
|
|
addopts = "--strict-markers --strict-config --durations=5"
|
|
|
|
# Registering custom markers.
|
|
|
|
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
|
|
|
|
markers = [
|
2023-10-13 16:48:24 +00:00
|
|
|
"requires: mark tests as requiring a specific library",
|
2023-10-21 14:52:18 +00:00
|
|
|
"asyncio: mark tests as requiring asyncio",
|
|
|
|
"compile: mark placeholder test used to compile integration tests without running them",
|
2023-07-21 17:36:28 +00:00
|
|
|
]
|