voyageai[patch]: init package (#19098)

Co-authored-by: fodizoltan <zoltan@conway.expert>
Co-authored-by: Yujie Qian <thomasq0809@gmail.com>
Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com>
pull/19050/head
Erick Friis 3 months ago committed by GitHub
parent 5157b15446
commit 7ce81eb6f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -75,6 +75,7 @@ jobs:
ES_API_KEY: ${{ secrets.ES_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
run: |
make integration_tests

@ -196,6 +196,7 @@ jobs:
ES_API_KEY: ${{ secrets.ES_API_KEY }}
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
run: make integration_tests
working-directory: ${{ inputs.working-directory }}

@ -0,0 +1,24 @@
# VoyageAI
All functionality related to VoyageAI
>[VoyageAI](https://www.voyageai.com/) builds embedding models, customized for your domain and company, for better retrieval quality.
## Installation and Setup
Install the integration package with
```bash
pip install langchain-voyageai
```
Get a VoyageAI API key and set it as an environment variable (`VOYAGE_API_KEY`).
## Text Embedding Model
See a [usage example](/docs/integrations/text_embedding/voyageai)
```python
from langchain_voyageai import VoyageAIEmbeddings
```

@ -9,7 +9,7 @@
"\n",
">[Voyage AI](https://www.voyageai.com/) provides cutting-edge embedding/vectorizations models.\n",
"\n",
"Let's load the Voyage Embedding class."
"Let's load the Voyage Embedding class. (Install the LangChain partner package with `pip install langchain-voyageai`)"
]
},
{
@ -19,7 +19,7 @@
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.embeddings import VoyageEmbeddings"
"from langchain_voyageai import VoyageAIEmbeddings"
]
},
{
@ -37,7 +37,7 @@
"metadata": {},
"outputs": [],
"source": [
"embeddings = VoyageEmbeddings(\n",
"embeddings = VoyageAIEmbeddings(\n",
" voyage_api_key=\"[ Your Voyage API key ]\", model=\"voyage-2\"\n",
")"
]

@ -14,6 +14,7 @@ from typing import (
)
import requests
from langchain_core._api.deprecation import deprecated
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import BaseModel, Extra, SecretStr, root_validator
from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
@ -58,6 +59,11 @@ def embed_with_retry(embeddings: VoyageEmbeddings, **kwargs: Any) -> Any:
return _embed_with_retry(**kwargs)
@deprecated(
since="0.0.29",
removal="0.2",
alternative_import="langchain_voyageai.VoyageAIEmbeddings",
)
class VoyageEmbeddings(BaseModel, Embeddings):
"""Voyage embedding models.

@ -0,0 +1 @@
__pycache__

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 LangChain, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

@ -0,0 +1,57 @@
# Every target in this file is a command, not a file, so all of them are
# declared phony. (docker_tests and extended_tests are kept from the original
# declaration for backward compatibility.)
.PHONY: all format format_diff lint lint_diff lint_package lint_tests \
	test tests integration_test integration_tests docker_tests extended_tests \
	spell_check spell_fix check_imports help

# Default target executed when no arguments are given to make.
all: help

# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/

# The integration targets point pytest at the integration suite instead.
integration_test integration_tests: TEST_FILE=tests/integration_tests/

test tests integration_test integration_tests:
	poetry run pytest $(TEST_FILE)

######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
# Only files changed relative to master; this may expand to nothing.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/voyageai --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_voyageai
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

# The per-file steps are skipped when PYTHON_FILES is empty (e.g. lint_diff
# with no changed files): mypy fails without a target and ruff would fall
# back to the whole directory.
lint lint_diff lint_package lint_tests:
	poetry run ruff .
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || { mkdir -p $(MYPY_CACHE) && poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE); }

# Same guard as above so format_diff with no changed files does not reformat
# the entire tree.
format format_diff:
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES)

spell_check:
	poetry run codespell --toml pyproject.toml

spell_fix:
	poetry run codespell --toml pyproject.toml -w

# Passes every package source file to the import checker.
check_imports: $(shell find langchain_voyageai -name '*.py')
	poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
	@echo '----'
	@echo 'check_imports				- check imports'
	@echo 'format                       - run code formatters'
	@echo 'lint                         - run linters'
	@echo 'test                         - run unit tests'
	@echo 'tests                        - run unit tests'
	@echo 'test TEST_FILE=<test_file>   - run all tests in file'

@ -0,0 +1,21 @@
# langchain-voyageai
This package contains the LangChain integrations for VoyageAI through their `voyageai` client package.
## Installation and Setup
- Install the LangChain partner package
```bash
pip install langchain-voyageai
```
- Get a VoyageAI API key and set it as an environment variable (`VOYAGE_API_KEY`), or pass it as the `voyage_api_key` parameter of `VoyageAIEmbeddings`.
## Text Embedding Model
See a [usage example](https://python.langchain.com/docs/integrations/text_embedding/voyageai)
```python
from langchain_voyageai import VoyageAIEmbeddings
```

@ -0,0 +1,5 @@
from langchain_voyageai.embeddings import VoyageAIEmbeddings
# Public names exported by the package.
__all__ = ["VoyageAIEmbeddings"]

@ -0,0 +1,130 @@
import logging
import os
from typing import Iterable, List, Optional
import voyageai # type: ignore
from langchain_core.embeddings import Embeddings
from langchain_core.pydantic_v1 import (
BaseModel,
Extra,
Field,
SecretStr,
root_validator,
)
from langchain_core.utils import convert_to_secret_str
logger = logging.getLogger(__name__)
class VoyageAIEmbeddings(BaseModel, Embeddings):
    """VoyageAIEmbeddings embedding model.

    Exposes the synchronous and asynchronous ``voyageai`` clients through the
    LangChain ``Embeddings`` interface. Documents are sent to the API in
    slices of ``batch_size`` texts.

    Example:
        .. code-block:: python

            from langchain_voyageai import VoyageAIEmbeddings

            model = VoyageAIEmbeddings(model="voyage-2")
    """

    # Clients are created in ``validate_environment``.
    _client: voyageai.Client = Field(exclude=True)
    _aclient: voyageai.client_async.AsyncClient = Field(exclude=True)
    # Name of the Voyage model to call (required).
    model: str
    # Number of texts per request; defaulted from ``model`` when omitted.
    batch_size: int
    # When True, wrap the batch loop in a tqdm progress bar.
    show_progress_bar: bool = False
    # Forwarded unchanged as the ``truncation`` argument of ``embed``.
    truncation: Optional[bool] = None
    # API key; falls back to the ``VOYAGE_API_KEY`` environment variable.
    voyage_api_key: Optional[SecretStr] = None

    class Config:
        extra = Extra.forbid

    @root_validator(pre=True)
    def default_values(cls, values: dict) -> dict:
        """Set default batch size based on model.

        Only applies when ``batch_size`` is missing or ``None``: 72 for
        ``voyage-2`` / ``voyage-02``, otherwise 7.
        """
        model = values.get("model")
        batch_size = values.get("batch_size")
        if batch_size is None:
            # NOTE(review): presumably these mirror provider-side batch
            # limits -- confirm against the Voyage API documentation.
            values["batch_size"] = 72 if model in ["voyage-2", "voyage-02"] else 7
        return values

    @root_validator()
    def validate_environment(cls, values: dict) -> dict:
        """Resolve the API key and build the sync and async clients.

        The key is taken from ``voyage_api_key`` if given, else from the
        ``VOYAGE_API_KEY`` environment variable. When neither is set the
        clients are constructed with ``api_key=None``.
        """
        voyage_api_key = values.get("voyage_api_key") or os.getenv(
            "VOYAGE_API_KEY", None
        )
        if voyage_api_key:
            api_key_secretstr = convert_to_secret_str(voyage_api_key)
            values["voyage_api_key"] = api_key_secretstr
            api_key_str = api_key_secretstr.get_secret_value()
        else:
            api_key_str = None
        values["_client"] = voyageai.Client(api_key=api_key_str)
        values["_aclient"] = voyageai.client_async.AsyncClient(api_key=api_key_str)
        return values

    def _get_batch_iterator(self, texts: List[str]) -> Iterable:
        """Return the start offsets of each batch of ``texts``.

        The offsets step by ``batch_size``; they are wrapped in ``tqdm`` when
        ``show_progress_bar`` is set.

        Raises:
            ImportError: if ``show_progress_bar`` is True and tqdm is missing.
        """
        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm  # type: ignore
            except ImportError as e:
                raise ImportError(
                    "Must have tqdm installed if `show_progress_bar` is set to True. "
                    "Please install with `pip install tqdm`."
                ) from e
            _iter = tqdm(range(0, len(texts), self.batch_size))
        else:
            _iter = range(0, len(texts), self.batch_size)  # type: ignore
        return _iter

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        Texts are sent in batches with ``input_type="document"``; the
        returned list has one embedding per input text, in input order.
        """
        embeddings: List[List[float]] = []
        _iter = self._get_batch_iterator(texts)
        for i in _iter:
            embeddings.extend(
                self._client.embed(
                    texts[i : i + self.batch_size],
                    model=self.model,
                    input_type="document",
                    truncation=self.truncation,
                ).embeddings
            )
        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed query text (sent with ``input_type="query"``)."""
        return self._client.embed(
            [text], model=self.model, input_type="query", truncation=self.truncation
        ).embeddings[0]

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Async counterpart of ``embed_documents``; batches are awaited one at a time."""
        embeddings: List[List[float]] = []
        _iter = self._get_batch_iterator(texts)
        for i in _iter:
            r = await self._aclient.embed(
                texts[i : i + self.batch_size],
                model=self.model,
                input_type="document",
                truncation=self.truncation,
            )
            embeddings.extend(r.embeddings)
        return embeddings

    async def aembed_query(self, text: str) -> List[float]:
        """Async counterpart of ``embed_query``."""
        r = await self._aclient.embed(
            [text],
            model=self.model,
            input_type="query",
            truncation=self.truncation,
        )
        return r.embeddings[0]

File diff suppressed because it is too large Load Diff

@ -0,0 +1,92 @@
[tool.poetry]
name = "langchain-voyageai"
version = "0.1.0"
description = "An integration package connecting VoyageAI and LangChain"
authors = []
readme = "README.md"
repository = "https://github.com/langchain-ai/langchain"
license = "MIT"
[tool.poetry.urls]
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/voyageai"
[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
langchain-core = "^0.1.32"
voyageai = ">=0.2.1,<1"
[tool.poetry.group.test]
optional = true
[tool.poetry.group.test.dependencies]
pytest = "^7.3.0"
freezegun = "^1.2.2"
pytest-mock = "^3.10.0"
syrupy = "^4.0.2"
pytest-watcher = "^0.3.4"
pytest-asyncio = "^0.21.1"
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.codespell]
optional = true
[tool.poetry.group.codespell.dependencies]
codespell = "^2.2.0"
[tool.poetry.group.test_integration]
optional = true
[tool.poetry.group.test_integration.dependencies]
[tool.poetry.group.lint]
optional = true
[tool.poetry.group.lint.dependencies]
ruff = "^0.1.5"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
langchain-core = { path = "../../core", develop = true }
[tool.poetry.group.dev]
optional = true
[tool.poetry.group.dev.dependencies]
langchain-core = { path = "../../core", develop = true }
[tool.ruff]
select = [
"E", # pycodestyle
"F", # pyflakes
"I", # isort
]
[tool.mypy]
disallow_untyped_defs = "True"
[tool.coverage.run]
omit = ["tests/*"]
[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"
[tool.pytest.ini_options]
# --strict-markers will raise errors on unknown marks.
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
#
# https://docs.pytest.org/en/7.1.x/reference/reference.html
# --strict-config any warnings encountered while parsing the `pytest`
# section of the configuration file raise errors.
#
# https://github.com/tophat/syrupy
# --snapshot-warn-unused    Prints a warning on unused snapshots rather than failing the test suite.
#                           (Not currently passed in `addopts` below.)
addopts = "--strict-markers --strict-config --durations=5"
# Registering custom markers.
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
markers = [
"requires: mark tests as requiring a specific library",
"asyncio: mark tests as requiring asyncio",
"compile: mark placeholder test used to compile integration tests without running them",
]
asyncio_mode = "auto"

@ -0,0 +1,17 @@
import sys
import traceback
from importlib.machinery import SourceFileLoader
if __name__ == "__main__":
    # Each command-line argument is the path of a Python source file to import.
    files = sys.argv[1:]
    has_failure = False
    for file in files:
        try:
            SourceFileLoader("x", file).load_module()
        except Exception:
            # Record the failure but keep going so every broken file is
            # reported in a single run.
            has_failure = True
            print(file)
            traceback.print_exc()
            print()
    # Exit non-zero if any file failed to import.
    sys.exit(1 if has_failure else 0)

@ -0,0 +1,27 @@
#!/bin/bash
#
# This script searches for lines starting with "import pydantic" or "from pydantic"
# in tracked files within a Git repository.
#
# Usage: ./scripts/check_pydantic.sh /path/to/repository
# Exactly one argument is required: the repository to scan.
if [ "$#" -ne 1 ]; then
  printf '%s\n' "Usage: $0 /path/to/repository"
  exit 1
fi

repo_dir="$1"

# Collect tracked lines that start with a direct pydantic import.
matches=$(git -C "$repo_dir" grep -E '^import pydantic|^from pydantic')

# Nothing matched: the check passes.
if [ -z "$matches" ]; then
  exit 0
fi

# Report the offending lines and fail.
printf '%s\n' "ERROR: The following lines need to be updated:"
printf '%s\n' "$matches"
printf '%s\n' "Please replace the code with an import from langchain_core.pydantic_v1."
printf '%s\n' "For example, replace 'from pydantic import BaseModel'"
printf '%s\n' "with 'from langchain_core.pydantic_v1 import BaseModel'"
exit 1

@ -0,0 +1,17 @@
#!/bin/bash
set -eu

# Number of forbidden-import patterns that produced a match.
errors=0

# Imports from langchain or langchain_experimental are not allowed here.
# The counter is bumped only when the corresponding git grep succeeds.
if git --no-pager grep '^from langchain\.' .; then
  errors=$((errors+1))
fi
if git --no-pager grep '^from langchain_experimental\.' .; then
  errors=$((errors+1))
fi

# Exit 1 if anything matched, 0 otherwise.
[ "$errors" -eq 0 ] || exit 1
exit 0

@ -0,0 +1,7 @@
import pytest
@pytest.mark.compile
def test_placeholder() -> None:
    """No-op test carrying the ``compile`` marker.

    Lets the integration test suite be collected/compiled without running
    any real tests.
    """

@ -0,0 +1,53 @@
"""Test VoyageAI embeddings."""
from langchain_voyageai import VoyageAIEmbeddings
# Please set VOYAGE_API_KEY in the environment variables
MODEL = "voyage-2"
def test_langchain_voyageai_embedding_documents() -> None:
    """A single document yields exactly one vector of length 1024."""
    texts = ["foo bar"]
    embedder = VoyageAIEmbeddings(model=MODEL)
    vectors = embedder.embed_documents(texts)
    assert len(vectors) == 1
    assert len(vectors[0]) == 1024
def test_langchain_voyageai_embedding_documents_multiple() -> None:
    """Three documents with ``batch_size=2`` yield three vectors of length 1024."""
    texts = ["foo bar", "bar foo", "foo"]
    embedder = VoyageAIEmbeddings(model=MODEL, batch_size=2)
    vectors = embedder.embed_documents(texts)
    assert len(vectors) == 3
    # Check each vector in order, as the individual assertions did.
    for position in (0, 1, 2):
        assert len(vectors[position]) == 1024
def test_langchain_voyageai_embedding_query() -> None:
    """A query string is embedded into a vector of length 1024."""
    query = "foo bar"
    embedder = VoyageAIEmbeddings(model=MODEL)
    vector = embedder.embed_query(query)
    assert len(vector) == 1024
async def test_langchain_voyageai_async_embedding_documents_multiple() -> None:
    """Async: three documents with ``batch_size=2`` yield three vectors of length 1024."""
    texts = ["foo bar", "bar foo", "foo"]
    embedder = VoyageAIEmbeddings(model=MODEL, batch_size=2)
    vectors = await embedder.aembed_documents(texts)
    assert len(vectors) == 3
    # Check each vector in order, as the individual assertions did.
    for position in (0, 1, 2):
        assert len(vectors[position]) == 1024
async def test_langchain_voyageai_async_embedding_query() -> None:
    """Async: a query string is embedded into a vector of length 1024."""
    query = "foo bar"
    embedder = VoyageAIEmbeddings(model=MODEL)
    vector = await embedder.aembed_query(query)
    assert len(vector) == 1024

@ -0,0 +1,36 @@
"""Test embedding model integration."""
from langchain_core.embeddings import Embeddings
from langchain_voyageai import VoyageAIEmbeddings
MODEL = "voyage-2"
def test_initialization_voyage_2() -> None:
    """With ``voyage-2`` and no explicit batch size, the batch size is 72."""
    embedder = VoyageAIEmbeddings(voyage_api_key="NOT_A_VALID_KEY", model=MODEL)
    assert isinstance(embedder, Embeddings)
    assert embedder.batch_size == 72
    assert embedder.model == MODEL
    assert embedder._client is not None
def test_initialization_voyage_1() -> None:
    """With ``voyage-01`` and no explicit batch size, the batch size is 7."""
    model_name = "voyage-01"
    embedder = VoyageAIEmbeddings(voyage_api_key="NOT_A_VALID_KEY", model=model_name)
    assert isinstance(embedder, Embeddings)
    assert embedder.batch_size == 7
    assert embedder.model == "voyage-01"
    assert embedder._client is not None
def test_initialization_voyage_1_batch_size() -> None:
    """An explicitly supplied batch size is kept as given."""
    init_kwargs = {
        "voyage_api_key": "NOT_A_VALID_KEY",
        "model": "voyage-01",
        "batch_size": 15,
    }
    embedder = VoyageAIEmbeddings(**init_kwargs)
    assert isinstance(embedder, Embeddings)
    assert embedder.batch_size == 15
    assert embedder.model == "voyage-01"
    assert embedder._client is not None

@ -0,0 +1,9 @@
from langchain_voyageai import __all__
EXPECTED_ALL = [
"VoyageAIEmbeddings",
]
def test_all_imports() -> None:
    """The package's ``__all__`` matches the expected names, ignoring order."""
    expected_names = sorted(EXPECTED_ALL)
    exported_names = sorted(__all__)
    assert expected_names == exported_names
Loading…
Cancel
Save