2023-07-21 17:36:28 +00:00
|
|
|
---
|
2023-08-08 21:55:25 +00:00
|
|
|
name: libs/experimental CI

# Runs on every push to master, on pull requests that touch one of the listed
# paths, and on demand.
on:
  push:
    branches:
      - master
  pull_request:
    paths:
      - '.github/actions/poetry_setup/action.yml'
      - '.github/tools/**'
      - '.github/workflows/_lint.yml'
      - '.github/workflows/_test.yml'
      - '.github/workflows/langchain_experimental_ci.yml'
      - 'libs/*'
      - 'libs/experimental/**'
  # Lets the workflow be started manually from the GitHub UI.
  workflow_dispatch:
|
|
|
|
|
2023-08-22 18:21:26 +00:00
|
|
|
# A newer push to the same PR or branch makes any run still in flight stale,
# so that earlier run is cancelled in favor of the new one.
#
# GitHub caps how many job runners can be active at once; dropping outdated
# runs early lets the runs that still matter start sooner.
concurrency:
  cancel-in-progress: true
  group: "${{ github.workflow }}-${{ github.ref }}"
|
|
|
|
|
2023-08-22 17:35:21 +00:00
|
|
|
# Workflow-level values read through the `env` context by the jobs that run
# their own steps.
env:
  # Directory of the package under test, relative to the repository root.
  WORKDIR: 'libs/experimental'
  # Poetry release passed to the poetry_setup action as `poetry-version`.
  POETRY_VERSION: '1.6.1'
|
|
|
|
|
2023-07-21 17:36:28 +00:00
|
|
|
jobs:
|
|
|
|
# Calls the repository's reusable _lint.yml workflow for libs/experimental,
# forwarding this workflow's secrets.
lint:
  uses: ./.github/workflows/_lint.yml
  secrets: inherit
  with:
    working-directory: libs/experimental
|
2023-08-22 17:35:21 +00:00
|
|
|
|
2023-07-21 17:36:28 +00:00
|
|
|
# Calls the repository's reusable _test.yml workflow for libs/experimental,
# forwarding this workflow's secrets.
test:
  uses: ./.github/workflows/_test.yml
  secrets: inherit
  with:
    working-directory: libs/experimental
|
2023-08-22 17:35:21 +00:00
|
|
|
|
2023-10-24 15:55:19 +00:00
|
|
|
# Calls the repository's reusable _compile_integration_test.yml workflow for
# libs/experimental, forwarding this workflow's secrets.
compile-integration-tests:
  uses: ./.github/workflows/_compile_integration_test.yml
  secrets: inherit
  with:
    working-directory: libs/experimental
|
|
|
|
|
2023-08-22 17:35:21 +00:00
|
|
|
# langchain-experimental can pass against the latest *published* langchain
# and still be broken against the langchain currently on `master`.
#
# This job tests against the in-repo langchain so that kind of breakage is
# caught *before* a new langchain is released.
test-with-latest-langchain:
  name: test with unpublished langchain - Python ${{ matrix.python-version }}
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version: ['3.8', '3.9', '3.10', '3.11']
  defaults:
    run:
      # Every `run` step below executes from the package directory.
      working-directory: ${{ env.WORKDIR }}
  steps:
    - uses: actions/checkout@v4

    - name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
      uses: ./.github/actions/poetry_setup
      with:
        cache-key: unpublished-langchain
        working-directory: ${{ env.WORKDIR }}
        poetry-version: ${{ env.POETRY_VERSION }}
        python-version: ${{ matrix.python-version }}

    - name: Install dependencies
      shell: bash
      # `../langchain` is resolved from WORKDIR (libs/experimental), i.e. the
      # sibling package directory under libs/.
      run: |
        echo "Running tests with unpublished langchain, installing dependencies with poetry..."
        poetry install

        echo "Editably installing langchain outside of poetry, to avoid messing up lockfile..."
        poetry run pip install -e ../langchain

    - name: Run tests
      run: make test
|
Add data anonymizer (#9863)
### Description
The feature for anonymizing data has been implemented. In order to
protect private data, such as when querying external APIs (OpenAI), it
is worth pseudonymizing sensitive data to maintain full privacy.
Anonymization consists of two steps:
1. **Identification:** Identify all data fields that contain personally
identifiable information (PII).
2. **Replacement**: Replace all PIIs with pseudo values or codes that do
not reveal any personal information about the individual but can be used
for reference. We're not using regular encryption, because the language
model won't be able to understand the meaning or context of the
encrypted data.
We use *Microsoft Presidio* together with *Faker* framework for
anonymization purposes because of the wide range of functionalities they
provide. The full implementation is available in `PresidioAnonymizer`.
### Future works
- **deanonymization** - add the ability to reverse anonymization. For
example, the workflow could look like this: `anonymize -> LLMChain ->
deanonymize`. By doing this, we will retain anonymity in requests to,
for example, OpenAI, and then be able to restore the original data.
- **instance anonymization** - at this point, each occurrence of PII is
treated as a separate entity and separately anonymized. Therefore, two
occurrences of the name John Doe in the text will be changed to two
different names. It is therefore worth introducing support for full
instance detection, so that repeated occurrences are treated as a single
object.
### Twitter handle
@deepsense_ai / @MaksOpp
---------
Co-authored-by: MaksOpp <maks.operlejn@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-08-30 17:39:44 +00:00
|
|
|
# Extended test job: one run per Python version in the matrix, with `run`
# steps executing from WORKDIR.
extended-tests:
  name: Python ${{ matrix.python-version }} extended tests
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version: ['3.8', '3.9', '3.10', '3.11']
  defaults:
    run:
      working-directory: ${{ env.WORKDIR }}
  steps:
    - uses: actions/checkout@v4
|
Add data anonymizer (#9863)
### Description
The feature for anonymizing data has been implemented. In order to
protect private data, such as when querying external APIs (OpenAI), it
is worth pseudonymizing sensitive data to maintain full privacy.
Anonymization consists of two steps:
1. **Identification:** Identify all data fields that contain personally
identifiable information (PII).
2. **Replacement**: Replace all PIIs with pseudo values or codes that do
not reveal any personal information about the individual but can be used
for reference. We're not using regular encryption, because the language
model won't be able to understand the meaning or context of the
encrypted data.
We use *Microsoft Presidio* together with *Faker* framework for
anonymization purposes because of the wide range of functionalities they
provide. The full implementation is available in `PresidioAnonymizer`.
### Future works
- **deanonymization** - add the ability to reverse anonymization. For
example, the workflow could look like this: `anonymize -> LLMChain ->
deanonymize`. By doing this, we will retain anonymity in requests to,
for example, OpenAI, and then be able to restore the original data.
- **instance anonymization** - at this point, each occurrence of PII is
treated as a separate entity and separately anonymized. Therefore, two
occurrences of the name John Doe in the text will be changed to two
different names. It is therefore worth introducing support for full
instance detection, so that repeated occurrences are treated as a single
object.
### Twitter handle
@deepsense_ai / @MaksOpp
---------
Co-authored-by: MaksOpp <maks.operlejn@gmail.com>
Co-authored-by: Bagatur <baskaryan@gmail.com>
2023-08-30 17:39:44 +00:00
|
|
|
|
|
|
|
# Sets up the matrix Python version and the pinned Poetry version through the
# repository-local poetry_setup action.
- name: Set up Python ${{ matrix.python-version }} + Poetry ${{ env.POETRY_VERSION }}
  uses: "./.github/actions/poetry_setup"
  with:
    python-version: ${{ matrix.python-version }}
    poetry-version: ${{ env.POETRY_VERSION }}
    # Taken from env.WORKDIR (currently "libs/experimental"), matching the
    # test-with-latest-langchain job, instead of repeating the literal path.
    working-directory: ${{ env.WORKDIR }}
    # Distinct cache key for this job's environment, which adds the
    # extended_testing extra.
    cache-key: extended
|
|
|
|
|
|
|
|
# Installs the package together with its `extended_testing` extra.
- name: Install dependencies
  run: |
    echo "Running extended tests, installing dependencies with poetry..."
    poetry install -E extended_testing
  shell: bash
|
|
|
|
|
|
|
|
# Runs the `extended_tests` make target from the job's default run directory.
- name: Run extended tests
  run: make extended_tests
|
2023-09-06 15:15:16 +00:00
|
|
|
|
|
|
|
# Fails the job when the test run left new or modified files in the checkout.
- name: Ensure the tests did not create any additional files
  shell: bash
  run: |
    set -eu

    # `--porcelain` prints one line per changed or untracked path and nothing
    # at all for a clean tree. Checking for empty output avoids matching the
    # human-readable "nothing to commit, working tree clean" sentence, whose
    # wording is not meant for scripts.
    STATUS="$(git status --porcelain)"

    if [ -n "$STATUS" ]; then
      echo "The tests left changes in the working tree:"
      echo "$STATUS"
      exit 1
    fi

    echo "Working tree is clean."
|