mirror of https://github.com/hwchase17/langchain
voyageai[patch]: init package (#19098)
Co-authored-by: fodizoltan <zoltan@conway.expert> Co-authored-by: Yujie Qian <thomasq0809@gmail.com> Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com>pull/19050/head
parent
5157b15446
commit
7ce81eb6f4
@ -0,0 +1,24 @@
|
||||
# VoyageAI
|
||||
|
||||
All functionality related to VoyageAI
|
||||
|
||||
>[VoyageAI](https://www.voyageai.com/) builds embedding models,
|
||||
> customized for your domain and company, for better retrieval quality.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
Install the integration package with
|
||||
```bash
|
||||
pip install langchain-voyageai
|
||||
```
|
||||
|
||||
Get a VoyageAI API key and set it as an environment variable (`VOYAGE_API_KEY`).
|
||||
|
||||
|
||||
## Text Embedding Model
|
||||
|
||||
See a [usage example](/docs/integrations/text_embedding/voyageai)
|
||||
|
||||
```python
|
||||
from langchain_voyageai import VoyageAIEmbeddings
|
||||
```
|
@ -0,0 +1 @@
|
||||
__pycache__
|
@ -0,0 +1,21 @@
|
||||
MIT License
|
||||
|
||||
Copyright (c) 2024 LangChain, Inc.
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
@ -0,0 +1,57 @@
|
||||
.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
# Every target below is a command, not a file, so declare the remaining ones phony too.
.PHONY: integration_test lint_diff lint_package lint_tests format_diff spell_check spell_fix check_imports

# Default target executed when no arguments are given to make.
all: help

# Define a variable for the test file path.
TEST_FILE ?= tests/unit_tests/
integration_test integration_tests: TEST_FILE=tests/integration_tests/

test tests integration_test integration_tests:
	poetry run pytest $(TEST_FILE)


######################
# LINTING AND FORMATTING
######################

# Define a variable for Python and notebook files.
PYTHON_FILES=.
MYPY_CACHE=.mypy_cache
lint format: PYTHON_FILES=.
lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/voyageai --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
lint_package: PYTHON_FILES=langchain_voyageai
lint_tests: PYTHON_FILES=tests
lint_tests: MYPY_CACHE=.mypy_cache_test

# PYTHON_FILES is empty when lint_diff/format_diff find no changed files; the
# guards skip the tools in that case instead of running them with no path.
# `mkdir -p` keeps repeated runs quiet when the cache directory already exists.
lint lint_diff lint_package lint_tests:
	poetry run ruff .
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES) --diff
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || { mkdir -p $(MYPY_CACHE) && poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE); }

format format_diff:
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff format $(PYTHON_FILES)
	[ "$(PYTHON_FILES)" = "" ] || poetry run ruff --select I --fix $(PYTHON_FILES)

spell_check:
	poetry run codespell --toml pyproject.toml

spell_fix:
	poetry run codespell --toml pyproject.toml -w

# Each package source file is a prerequisite so `$^` expands to the full file list.
check_imports: $(shell find langchain_voyageai -name '*.py')
	poetry run python ./scripts/check_imports.py $^

######################
# HELP
######################

help:
	@echo '----'
	@echo 'check_imports				- check imports'
	@echo 'format                       - run code formatters'
	@echo 'format_diff                  - run code formatters on files changed vs master'
	@echo 'lint                         - run linters'
	@echo 'lint_diff                    - run linters on files changed vs master'
	@echo 'spell_check                  - run codespell'
	@echo 'spell_fix                    - run codespell and write fixes'
	@echo 'test                         - run unit tests'
	@echo 'tests                        - run unit tests'
	@echo 'integration_tests            - run integration tests'
	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
|
@ -0,0 +1,21 @@
|
||||
# langchain-voyageai
|
||||
|
||||
This package contains the LangChain integrations for VoyageAI through their `voyageai` client package.
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
- Install the LangChain partner package
|
||||
```bash
|
||||
pip install langchain-voyageai
|
||||
```
|
||||
- Get a VoyageAI API key and set it as an environment variable (`VOYAGE_API_KEY`), or pass the API key as a parameter to the client.
|
||||
|
||||
|
||||
|
||||
## Text Embedding Model
|
||||
|
||||
See a [usage example](https://python.langchain.com/docs/integrations/text_embedding/voyageai)
|
||||
|
||||
```python
|
||||
from langchain_voyageai import VoyageAIEmbeddings
|
||||
```
|
@ -0,0 +1,5 @@
|
||||
from langchain_voyageai.embeddings import VoyageAIEmbeddings
|
||||
|
||||
# Public API of the package.
__all__ = ["VoyageAIEmbeddings"]
|
@ -0,0 +1,130 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import Iterable, List, Optional
|
||||
|
||||
import voyageai # type: ignore
|
||||
from langchain_core.embeddings import Embeddings
|
||||
from langchain_core.pydantic_v1 import (
|
||||
BaseModel,
|
||||
Extra,
|
||||
Field,
|
||||
SecretStr,
|
||||
root_validator,
|
||||
)
|
||||
from langchain_core.utils import convert_to_secret_str
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VoyageAIEmbeddings(BaseModel, Embeddings):
    """VoyageAIEmbeddings embedding model.

    ``model`` is required. ``batch_size`` defaults to 72 for ``voyage-2`` /
    ``voyage-02`` and to 7 for any other model. The API key is taken from
    ``voyage_api_key`` or, failing that, from the ``VOYAGE_API_KEY``
    environment variable.

    Example:
        .. code-block:: python

            from langchain_voyageai import VoyageAIEmbeddings

            model = VoyageAIEmbeddings(model="voyage-2")
    """

    # Synchronous and asynchronous clients, both built in `validate_environment`.
    _client: voyageai.Client = Field(exclude=True)
    _aclient: voyageai.client_async.AsyncClient = Field(exclude=True)
    # Model name forwarded to every `embed` call.
    model: str
    # Number of texts sent to the client per `embed` call.
    batch_size: int
    # When True, batch iteration is wrapped in a tqdm progress bar.
    show_progress_bar: bool = False
    # Forwarded unchanged as the `truncation` argument of `embed`.
    truncation: Optional[bool] = None
    voyage_api_key: Optional[SecretStr] = None

    class Config:
        extra = Extra.forbid

    @root_validator(pre=True)
    def default_values(cls, values: dict) -> dict:
        """Set default batch size based on model.

        Runs before field validation, so ``values`` holds the raw constructor
        arguments. An explicitly supplied ``batch_size`` is left untouched.
        """
        model = values.get("model")
        batch_size = values.get("batch_size")
        if batch_size is None:
            values["batch_size"] = 72 if model in ("voyage-2", "voyage-02") else 7
        return values

    @root_validator()
    def validate_environment(cls, values: dict) -> dict:
        """Resolve the API key and build the sync and async clients.

        The key comes from ``voyage_api_key`` or, if that is unset, from the
        ``VOYAGE_API_KEY`` environment variable. When neither is set, ``None``
        is passed to the clients as the API key.
        """
        voyage_api_key = values.get("voyage_api_key") or os.getenv(
            "VOYAGE_API_KEY", None
        )
        if voyage_api_key:
            api_key_secretstr = convert_to_secret_str(voyage_api_key)
            values["voyage_api_key"] = api_key_secretstr

            api_key_str = api_key_secretstr.get_secret_value()
        else:
            api_key_str = None
        values["_client"] = voyageai.Client(api_key=api_key_str)
        values["_aclient"] = voyageai.client_async.AsyncClient(api_key=api_key_str)
        return values

    def _get_batch_iterator(self, texts: List[str]) -> Iterable:
        """Return the start offsets of each batch of ``texts``.

        Offsets step by ``batch_size``. With ``show_progress_bar`` set, the
        range is wrapped in ``tqdm``.

        Raises:
            ImportError: if ``show_progress_bar`` is True and tqdm is missing.
        """
        if self.show_progress_bar:
            try:
                from tqdm.auto import tqdm  # type: ignore
            except ImportError as e:
                raise ImportError(
                    "Must have tqdm installed if `show_progress_bar` is set to True. "
                    "Please install with `pip install tqdm`."
                ) from e

            _iter = tqdm(range(0, len(texts), self.batch_size))
        else:
            _iter = range(0, len(texts), self.batch_size)  # type: ignore

        return _iter

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs.

        Texts are sent in batches of ``batch_size`` with
        ``input_type="document"``; results are concatenated in input order.
        """
        embeddings: List[List[float]] = []

        _iter = self._get_batch_iterator(texts)
        for i in _iter:
            embeddings.extend(
                self._client.embed(
                    texts[i : i + self.batch_size],
                    model=self.model,
                    input_type="document",
                    truncation=self.truncation,
                ).embeddings
            )

        return embeddings

    def embed_query(self, text: str) -> List[float]:
        """Embed query text."""
        return self._client.embed(
            [text], model=self.model, input_type="query", truncation=self.truncation
        ).embeddings[0]

    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
        """Embed search docs using the async client.

        Batches are awaited one after another, so results keep input order.
        """
        embeddings: List[List[float]] = []

        _iter = self._get_batch_iterator(texts)
        for i in _iter:
            r = await self._aclient.embed(
                texts[i : i + self.batch_size],
                model=self.model,
                input_type="document",
                truncation=self.truncation,
            )
            embeddings.extend(r.embeddings)

        return embeddings

    async def aembed_query(self, text: str) -> List[float]:
        """Embed query text using the async client."""
        r = await self._aclient.embed(
            [text],
            model=self.model,
            input_type="query",
            truncation=self.truncation,
        )
        return r.embeddings[0]
|
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,92 @@
|
||||
[tool.poetry]
|
||||
name = "langchain-voyageai"
|
||||
version = "0.1.0"
|
||||
description = "An integration package connecting VoyageAI and LangChain"
|
||||
authors = []
|
||||
readme = "README.md"
|
||||
repository = "https://github.com/langchain-ai/langchain"
|
||||
license = "MIT"
|
||||
|
||||
[tool.poetry.urls]
|
||||
"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/voyageai"
|
||||
|
||||
[tool.poetry.dependencies]
|
||||
python = ">=3.8.1,<4.0"
|
||||
langchain-core = "^0.1.32"
|
||||
voyageai = ">=0.2.1,<1"
|
||||
|
||||
[tool.poetry.group.test]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.test.dependencies]
|
||||
pytest = "^7.3.0"
|
||||
freezegun = "^1.2.2"
|
||||
pytest-mock = "^3.10.0"
|
||||
syrupy = "^4.0.2"
|
||||
pytest-watcher = "^0.3.4"
|
||||
pytest-asyncio = "^0.21.1"
|
||||
langchain-core = { path = "../../core", develop = true }
|
||||
|
||||
[tool.poetry.group.codespell]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.codespell.dependencies]
|
||||
codespell = "^2.2.0"
|
||||
|
||||
[tool.poetry.group.test_integration]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.test_integration.dependencies]
|
||||
|
||||
[tool.poetry.group.lint]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.lint.dependencies]
|
||||
ruff = "^0.1.5"
|
||||
|
||||
[tool.poetry.group.typing.dependencies]
|
||||
mypy = "^0.991"
|
||||
langchain-core = { path = "../../core", develop = true }
|
||||
|
||||
[tool.poetry.group.dev]
|
||||
optional = true
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
langchain-core = { path = "../../core", develop = true }
|
||||
|
||||
[tool.ruff]
|
||||
select = [
|
||||
"E", # pycodestyle
|
||||
"F", # pyflakes
|
||||
"I", # isort
|
||||
]
|
||||
|
||||
[tool.mypy]
|
||||
disallow_untyped_defs = "True"
|
||||
|
||||
[tool.coverage.run]
|
||||
omit = ["tests/*"]
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core>=1.0.0"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
# --strict-markers will raise errors on unknown marks.
|
||||
# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
|
||||
#
|
||||
# https://docs.pytest.org/en/7.1.x/reference/reference.html
|
||||
# --strict-config any warnings encountered while parsing the `pytest`
|
||||
# section of the configuration file raise errors.
|
||||
#
|
||||
# https://github.com/tophat/syrupy
|
||||
# --snapshot-warn-unused Prints a warning on unused snapshots rather than fail the test suite.
|
||||
addopts = "--strict-markers --strict-config --durations=5"
|
||||
# Registering custom markers.
|
||||
# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
|
||||
markers = [
|
||||
"requires: mark tests as requiring a specific library",
|
||||
"asyncio: mark tests as requiring asyncio",
|
||||
"compile: mark placeholder test used to compile integration tests without running them",
|
||||
]
|
||||
asyncio_mode = "auto"
|
@ -0,0 +1,17 @@
|
||||
import sys
import traceback
from importlib.machinery import SourceFileLoader
from typing import List, Sequence


def find_import_failures(files: Sequence[str]) -> List[str]:
    """Try to load each file as a module and return the ones that fail.

    For every file that raises on load, the path and the traceback are
    printed, followed by a blank line.

    Args:
        files: Paths of Python source files to load.

    Returns:
        The paths that raised an exception while loading, in input order.
    """
    failed: List[str] = []
    for file in files:
        try:
            SourceFileLoader("x", file).load_module()
        except Exception:
            # Record the failure; the original misspelled flag never did.
            failed.append(file)
            print(file)
            traceback.print_exc()
            print()
    return failed


if __name__ == "__main__":
    # Exit non-zero when at least one file could not be imported.
    sys.exit(1 if find_import_failures(sys.argv[1:]) else 0)
|
@ -0,0 +1,27 @@
|
||||
#!/bin/bash
#
# Reports tracked files in a Git repository that contain a line starting with
# "import pydantic" or "from pydantic".
#
# Usage: ./scripts/check_pydantic.sh /path/to/repository

# Exactly one argument (the repository path) is required.
if [[ $# -ne 1 ]]; then
  echo "Usage: $0 /path/to/repository"
  exit 1
fi

repo_dir="$1"

# Collect every matching line from the repository's tracked files.
matches=$(git -C "$repo_dir" grep -E '^import pydantic|^from pydantic')

# Nothing matched: succeed.
if [[ -z "$matches" ]]; then
  exit 0
fi

echo "ERROR: The following lines need to be updated:"
echo "$matches"
echo "Please replace the code with an import from langchain_core.pydantic_v1."
echo "For example, replace 'from pydantic import BaseModel'"
echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
exit 1
|
@ -0,0 +1,17 @@
|
||||
#!/bin/bash

set -eu

# Number of forbidden import patterns that matched at least one line.
errors=0

# Imports from langchain or langchain_experimental are not allowed here.
# git grep prints the offending lines and succeeds only when it finds a match.
for pattern in '^from langchain\.' '^from langchain_experimental\.'; do
  if git --no-pager grep "$pattern" .; then
    errors=$((errors+1))
  fi
done

# Succeed only when no pattern matched.
if [ "$errors" -eq 0 ]; then
  exit 0
fi
exit 1
|
@ -0,0 +1,7 @@
|
||||
import pytest


@pytest.mark.compile
def test_placeholder() -> None:
    """Do nothing; lets integration tests be compiled without running real ones."""
    return None
|
@ -0,0 +1,53 @@
|
||||
"""Test VoyageAI embeddings."""
|
||||
|
||||
from langchain_voyageai import VoyageAIEmbeddings
|
||||
|
||||
# Please set VOYAGE_API_KEY in the environment variables
|
||||
MODEL = "voyage-2"
|
||||
|
||||
|
||||
def test_langchain_voyageai_embedding_documents() -> None:
    """Embedding one document yields one 1024-dimensional vector."""
    embedder = VoyageAIEmbeddings(model=MODEL)
    vectors = embedder.embed_documents(["foo bar"])
    assert [len(vector) for vector in vectors] == [1024]
|
||||
|
||||
|
||||
def test_langchain_voyageai_embedding_documents_multiple() -> None:
    """Three documents in batches of two yield three 1024-dimensional vectors."""
    embedder = VoyageAIEmbeddings(model=MODEL, batch_size=2)
    vectors = embedder.embed_documents(["foo bar", "bar foo", "foo"])
    assert [len(vector) for vector in vectors] == [1024, 1024, 1024]
|
||||
|
||||
|
||||
def test_langchain_voyageai_embedding_query() -> None:
    """Embedding a query yields a single 1024-dimensional vector."""
    embedder = VoyageAIEmbeddings(model=MODEL)
    vector = embedder.embed_query("foo bar")
    assert len(vector) == 1024
|
||||
|
||||
|
||||
async def test_langchain_voyageai_async_embedding_documents_multiple() -> None:
    """Async: three documents in batches of two yield three 1024-dim vectors."""
    embedder = VoyageAIEmbeddings(model=MODEL, batch_size=2)
    vectors = await embedder.aembed_documents(["foo bar", "bar foo", "foo"])
    assert [len(vector) for vector in vectors] == [1024, 1024, 1024]
|
||||
|
||||
|
||||
async def test_langchain_voyageai_async_embedding_query() -> None:
    """Async: embedding a query yields a single 1024-dimensional vector."""
    embedder = VoyageAIEmbeddings(model=MODEL)
    vector = await embedder.aembed_query("foo bar")
    assert len(vector) == 1024
|
@ -0,0 +1,36 @@
|
||||
"""Test embedding model integration."""
|
||||
|
||||
from langchain_core.embeddings import Embeddings
|
||||
|
||||
from langchain_voyageai import VoyageAIEmbeddings
|
||||
|
||||
MODEL = "voyage-2"
|
||||
|
||||
|
||||
def test_initialization_voyage_2() -> None:
    """The voyage-2 model gets the default batch size of 72."""
    embedder = VoyageAIEmbeddings(voyage_api_key="NOT_A_VALID_KEY", model=MODEL)
    assert isinstance(embedder, Embeddings)
    assert (embedder.batch_size, embedder.model) == (72, MODEL)
    assert embedder._client is not None
|
||||
|
||||
|
||||
def test_initialization_voyage_1() -> None:
    """The voyage-01 model gets the default batch size of 7."""
    model_name = "voyage-01"
    embedder = VoyageAIEmbeddings(voyage_api_key="NOT_A_VALID_KEY", model=model_name)
    assert isinstance(embedder, Embeddings)
    assert (embedder.batch_size, embedder.model) == (7, model_name)
    assert embedder._client is not None
|
||||
|
||||
|
||||
def test_initialization_voyage_1_batch_size() -> None:
    """An explicit batch size overrides the model-based default."""
    model_name = "voyage-01"
    embedder = VoyageAIEmbeddings(
        voyage_api_key="NOT_A_VALID_KEY",
        model=model_name,
        batch_size=15,
    )
    assert isinstance(embedder, Embeddings)
    assert (embedder.batch_size, embedder.model) == (15, model_name)
    assert embedder._client is not None
|
@ -0,0 +1,9 @@
|
||||
from langchain_voyageai import __all__
|
||||
|
||||
# Names the package is expected to export.
EXPECTED_ALL = ["VoyageAIEmbeddings"]


def test_all_imports() -> None:
    """The package's ``__all__`` matches the expected names, ignoring order."""
    exported = sorted(__all__)
    assert sorted(EXPECTED_ALL) == exported
|
Loading…
Reference in New Issue