voyageai[patch]: init package (#19098)

Co-authored-by: fodizoltan <zoltan@conway.expert> Co-authored-by: Yujie Qian <thomasq0809@gmail.com> Co-authored-by: fzowl <160063452+fzowl@users.noreply.github.com>
3 months ago · 7ce81eb6f4
parent 5157b15446
commit 7ce81eb6f4
24 changed files with 1898 additions and 3 deletions
--- a/.github/workflows/_integration_test.yml
+++ b/.github/workflows/_integration_test.yml
@ -75,6 +75,7 @@ jobs:
          ES_API_KEY: ${{ secrets.ES_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
          MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
+          VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
        run: |
          make integration_tests

--- a/.github/workflows/_release.yml
+++ b/.github/workflows/_release.yml
@ -196,6 +196,7 @@ jobs:
          ES_API_KEY: ${{ secrets.ES_API_KEY }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} # for airbyte
          MONGODB_ATLAS_URI: ${{ secrets.MONGODB_ATLAS_URI }}
+          VOYAGE_API_KEY: ${{ secrets.VOYAGE_API_KEY }}
        run: make integration_tests
        working-directory: ${{ inputs.working-directory }}

--- a/docs/docs/integrations/platforms/voyageai.mdx
+++ b/docs/docs/integrations/platforms/voyageai.mdx
@ -0,0 +1,24 @@
+# VoyageAI
+
+All functionality related to VoyageAI
+
+>[Voyage AI](https://www.voyageai.com/) builds embedding models, customized for your domain and company, for better retrieval quality.
+
+## Installation and Setup
+
+Install the integration package with
+```bash
+pip install langchain-voyageai
+```
+
+Get a VoyageAI API key and set it as an environment variable (`VOYAGE_API_KEY`).
+
+
+## Text Embedding Model
+
+See a [usage example](/docs/integrations/text_embedding/voyageai)
+
+```python
+from langchain_voyageai import VoyageAIEmbeddings
+```
--- a/docs/docs/integrations/text_embedding/voyageai.ipynb
+++ b/docs/docs/integrations/text_embedding/voyageai.ipynb
@ -9,7 +9,7 @@
    "\n",
    ">[Voyage AI](https://www.voyageai.com/) provides cutting-edge embedding/vectorizations models.\n",
    "\n",
-    "Let's load the Voyage Embedding class."
+    "Let's load the Voyage Embedding class. (Install the LangChain partner package with `pip install langchain-voyageai`)"
   ]
  },
  {
@ -19,7 +19,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "from langchain_community.embeddings import VoyageEmbeddings"
+    "from langchain_voyageai import VoyageAIEmbeddings"
   ]
  },
  {
@ -37,7 +37,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "embeddings = VoyageEmbeddings(\n",
+    "embeddings = VoyageAIEmbeddings(\n",
    "    voyage_api_key=\"[ Your Voyage API key ]\", model=\"voyage-2\"\n",
    ")"
   ]
--- a/libs/community/langchain_community/embeddings/voyageai.py
+++ b/libs/community/langchain_community/embeddings/voyageai.py
@ -14,6 +14,7 @@ from typing import (
 )

 import requests
+from langchain_core._api.deprecation import deprecated
 from langchain_core.embeddings import Embeddings
 from langchain_core.pydantic_v1 import BaseModel, Extra, SecretStr, root_validator
 from langchain_core.utils import convert_to_secret_str, get_from_dict_or_env
@ -58,6 +59,11 @@ def embed_with_retry(embeddings: VoyageEmbeddings, **kwargs: Any) -> Any:
    return _embed_with_retry(**kwargs)


+@deprecated(
+    since="0.0.29",
+    removal="0.2",
+    alternative_import="langchain_voyageai.VoyageAIEmbeddings",
+)
 class VoyageEmbeddings(BaseModel, Embeddings):
    """Voyage embedding models.

--- a/libs/partners/voyageai/.gitignore
+++ b/libs/partners/voyageai/.gitignore
@ -0,0 +1 @@
+__pycache__
--- a/libs/partners/voyageai/LICENSE
+++ b/libs/partners/voyageai/LICENSE
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 LangChain, Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/libs/partners/voyageai/Makefile
+++ b/libs/partners/voyageai/Makefile
@ -0,0 +1,57 @@
+.PHONY: all format lint test tests integration_tests docker_tests help extended_tests
+
+# Default target executed when no arguments are given to make.
+all: help
+
+# Define a variable for the test file path.
+TEST_FILE ?= tests/unit_tests/
+integration_test integration_tests: TEST_FILE=tests/integration_tests/
+
+test tests integration_test integration_tests:
+	poetry run pytest $(TEST_FILE)
+
+
+######################
+# LINTING AND FORMATTING
+######################
+
+# Define a variable for Python and notebook files.
+PYTHON_FILES=.
+MYPY_CACHE=.mypy_cache
+lint format: PYTHON_FILES=.
+lint_diff format_diff: PYTHON_FILES=$(shell git diff --relative=libs/partners/voyageai --name-only --diff-filter=d master | grep -E '\.py$$|\.ipynb$$')
+lint_package: PYTHON_FILES=langchain_voyageai
+lint_tests: PYTHON_FILES=tests
+lint_tests: MYPY_CACHE=.mypy_cache_test
+
+# Run ruff (lint, format check, import order) and mypy.
+# NOTE: the first ruff call always lints the whole directory; only the later
+# commands honour PYTHON_FILES.
+lint lint_diff lint_package lint_tests:
+	poetry run ruff .
+	poetry run ruff format $(PYTHON_FILES) --diff
+	poetry run ruff --select I $(PYTHON_FILES)
+	# -p: do not complain when the cache directory already exists from a previous run.
+	mkdir -p $(MYPY_CACHE); poetry run mypy $(PYTHON_FILES) --cache-dir $(MYPY_CACHE)
+
+format format_diff:
+	poetry run ruff format $(PYTHON_FILES)
+	poetry run ruff --select I --fix $(PYTHON_FILES)
+
+spell_check:
+	poetry run codespell --toml pyproject.toml
+
+spell_fix:
+	poetry run codespell --toml pyproject.toml -w
+
+check_imports: $(shell find langchain_voyageai -name '*.py')
+	poetry run python ./scripts/check_imports.py $^
+
+######################
+# HELP
+######################
+
+# Print a short summary of the most useful targets.
+help:
+	@echo '----'
+	@echo 'check_imports                - check imports'
+	@echo 'format                       - run code formatters'
+	@echo 'lint                         - run linters'
+	@echo 'spell_check                  - run codespell'
+	@echo 'test                         - run unit tests'
+	@echo 'tests                        - run unit tests'
+	@echo 'integration_tests            - run integration tests'
+	@echo 'test TEST_FILE=<test_file>   - run all tests in file'
--- a/libs/partners/voyageai/README.md
+++ b/libs/partners/voyageai/README.md
@ -0,0 +1,21 @@
+# langchain-voyageai
+
+This package contains the LangChain integrations for VoyageAI through their `voyageai` client package.
+
+## Installation and Setup
+
+- Install the LangChain partner package
+```bash
+pip install langchain-voyageai
+```
+- Get a VoyageAI API key and set it as an environment variable (`VOYAGE_API_KEY`) or pass the API key as a parameter to the client.
+
+
+
+## Text Embedding Model
+
+See a [usage example](https://python.langchain.com/docs/integrations/text_embedding/voyageai)
+
+```python
+from langchain_voyageai import VoyageAIEmbeddings
+```
--- a/libs/partners/voyageai/langchain_voyageai/__init__.py
+++ b/libs/partners/voyageai/langchain_voyageai/__init__.py
@ -0,0 +1,5 @@
+from langchain_voyageai.embeddings import VoyageAIEmbeddings
+
+__all__ = [
+    "VoyageAIEmbeddings",
+]
--- a/libs/partners/voyageai/langchain_voyageai/embeddings.py
+++ b/libs/partners/voyageai/langchain_voyageai/embeddings.py
@ -0,0 +1,130 @@
+import logging
+import os
+from typing import Iterable, List, Optional
+
+import voyageai  # type: ignore
+from langchain_core.embeddings import Embeddings
+from langchain_core.pydantic_v1 import (
+    BaseModel,
+    Extra,
+    Field,
+    SecretStr,
+    root_validator,
+)
+from langchain_core.utils import convert_to_secret_str
+
+logger = logging.getLogger(__name__)
+
+
+class VoyageAIEmbeddings(BaseModel, Embeddings):
+    """Embedding model backed by the ``voyageai`` client package.
+
+    The API key is read from the ``voyage_api_key`` argument or, when that is
+    not given, from the ``VOYAGE_API_KEY`` environment variable. ``model`` is
+    required. When ``batch_size`` is omitted it defaults to 72 for the
+    ``voyage-2`` / ``voyage-02`` models and to 7 for any other model.
+
+    Example:
+        .. code-block:: python
+
+            from langchain_voyageai import VoyageAIEmbeddings
+
+            model = VoyageAIEmbeddings(model="voyage-2")
+    """
+
+    # Sync and async VoyageAI clients; both are built in ``validate_environment``.
+    _client: voyageai.Client = Field(exclude=True)
+    _aclient: voyageai.client_async.AsyncClient = Field(exclude=True)
+    # Name of the Voyage model to call (required, no default).
+    model: str
+    # Number of texts sent per embed request; defaulted in ``default_values``.
+    batch_size: int
+    # Wrap the batch loop in a tqdm progress bar (requires ``tqdm``).
+    show_progress_bar: bool = False
+    # Passed through unchanged as the client's ``truncation`` argument.
+    truncation: Optional[bool] = None
+    # API key; falls back to the ``VOYAGE_API_KEY`` environment variable.
+    voyage_api_key: Optional[SecretStr] = None
+
+    class Config:
+        extra = Extra.forbid
+
+    @root_validator(pre=True)
+    def default_values(cls, values: dict) -> dict:
+        """Fill in a model-dependent default when ``batch_size`` is not given."""
+        if values.get("batch_size") is None:
+            model = values.get("model")
+            values["batch_size"] = 72 if model in ["voyage-2", "voyage-02"] else 7
+        return values
+
+    @root_validator()
+    def validate_environment(cls, values: dict) -> dict:
+        """Resolve the API key and build the sync and async VoyageAI clients.
+
+        The key comes from ``voyage_api_key`` or the ``VOYAGE_API_KEY``
+        environment variable. If neither is set, the clients are created
+        with ``api_key=None``.
+        """
+        voyage_api_key = values.get("voyage_api_key") or os.getenv(
+            "VOYAGE_API_KEY", None
+        )
+        if voyage_api_key:
+            # Store the key as a SecretStr; hand the raw string to the clients.
+            api_key_secretstr = convert_to_secret_str(voyage_api_key)
+            values["voyage_api_key"] = api_key_secretstr
+
+            api_key_str = api_key_secretstr.get_secret_value()
+        else:
+            api_key_str = None
+        values["_client"] = voyageai.Client(api_key=api_key_str)
+        values["_aclient"] = voyageai.client_async.AsyncClient(api_key=api_key_str)
+        return values
+
+    def _get_batch_iterator(self, texts: List[str]) -> Iterable:
+        """Return the start offsets of each batch of ``batch_size`` texts.
+
+        The offsets are wrapped in a tqdm progress bar when
+        ``show_progress_bar`` is set.
+
+        Raises:
+            ImportError: if ``show_progress_bar`` is True and tqdm is missing.
+        """
+        if self.show_progress_bar:
+            try:
+                from tqdm.auto import tqdm  # type: ignore
+            except ImportError as e:
+                raise ImportError(
+                    "Must have tqdm installed if `show_progress_bar` is set to True. "
+                    "Please install with `pip install tqdm`."
+                ) from e
+
+            _iter = tqdm(range(0, len(texts), self.batch_size))
+        else:
+            _iter = range(0, len(texts), self.batch_size)  # type: ignore
+
+        return _iter
+
+    def embed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Embed search docs, one request per batch of ``batch_size`` texts."""
+        embeddings: List[List[float]] = []
+
+        _iter = self._get_batch_iterator(texts)
+        for i in _iter:
+            embeddings.extend(
+                self._client.embed(
+                    texts[i : i + self.batch_size],
+                    model=self.model,
+                    input_type="document",
+                    truncation=self.truncation,
+                ).embeddings
+            )
+
+        return embeddings
+
+    def embed_query(self, text: str) -> List[float]:
+        """Embed query text."""
+        return self._client.embed(
+            [text], model=self.model, input_type="query", truncation=self.truncation
+        ).embeddings[0]
+
+    async def aembed_documents(self, texts: List[str]) -> List[List[float]]:
+        """Async variant of ``embed_documents``; batches are awaited in order."""
+        embeddings: List[List[float]] = []
+
+        _iter = self._get_batch_iterator(texts)
+        for i in _iter:
+            r = await self._aclient.embed(
+                texts[i : i + self.batch_size],
+                model=self.model,
+                input_type="document",
+                truncation=self.truncation,
+            )
+            embeddings.extend(r.embeddings)
+
+        return embeddings
+
+    async def aembed_query(self, text: str) -> List[float]:
+        """Async variant of ``embed_query``."""
+        r = await self._aclient.embed(
+            [text],
+            model=self.model,
+            input_type="query",
+            truncation=self.truncation,
+        )
+        return r.embeddings[0]
--- a/libs/partners/voyageai/langchain_voyageai/py.typed
+++ b/libs/partners/voyageai/langchain_voyageai/py.typed
--- a/libs/partners/voyageai/poetry.lock
+++ b/libs/partners/voyageai/poetry.lock
--- a/libs/partners/voyageai/pyproject.toml
+++ b/libs/partners/voyageai/pyproject.toml
@ -0,0 +1,92 @@
+[tool.poetry]
+name = "langchain-voyageai"
+version = "0.1.0"
+description = "An integration package connecting VoyageAI and LangChain"
+authors = []
+readme = "README.md"
+repository = "https://github.com/langchain-ai/langchain"
+license = "MIT"
+
+[tool.poetry.urls]
+"Source Code" = "https://github.com/langchain-ai/langchain/tree/master/libs/partners/voyageai"
+
+[tool.poetry.dependencies]
+python = ">=3.8.1,<4.0"
+langchain-core = "^0.1.32"
+voyageai = ">=0.2.1,<1"
+
+[tool.poetry.group.test]
+optional = true
+
+[tool.poetry.group.test.dependencies]
+pytest = "^7.3.0"
+freezegun = "^1.2.2"
+pytest-mock = "^3.10.0"
+syrupy = "^4.0.2"
+pytest-watcher = "^0.3.4"
+pytest-asyncio = "^0.21.1"
+langchain-core = { path = "../../core", develop = true }
+
+[tool.poetry.group.codespell]
+optional = true
+
+[tool.poetry.group.codespell.dependencies]
+codespell = "^2.2.0"
+
+[tool.poetry.group.test_integration]
+optional = true
+
+[tool.poetry.group.test_integration.dependencies]
+
+[tool.poetry.group.lint]
+optional = true
+
+[tool.poetry.group.lint.dependencies]
+ruff = "^0.1.5"
+
+[tool.poetry.group.typing.dependencies]
+mypy = "^0.991"
+langchain-core = { path = "../../core", develop = true }
+
+[tool.poetry.group.dev]
+optional = true
+
+[tool.poetry.group.dev.dependencies]
+langchain-core = { path = "../../core", develop = true }
+
+[tool.ruff]
+select = [
+  "E", # pycodestyle
+  "F", # pyflakes
+  "I", # isort
+]
+
+[tool.mypy]
+disallow_untyped_defs = "True"
+
+[tool.coverage.run]
+omit = ["tests/*"]
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[tool.pytest.ini_options]
+# --strict-markers will raise errors on unknown marks.
+# https://docs.pytest.org/en/7.1.x/how-to/mark.html#raising-errors-on-unknown-marks
+#
+# https://docs.pytest.org/en/7.1.x/reference/reference.html
+# --strict-config       any warnings encountered while parsing the `pytest`
+#                       section of the configuration file raise errors.
+#
+# https://github.com/tophat/syrupy
+# --snapshot-warn-unused    Prints a warning on unused snapshots rather than fail the test suite.
+addopts = "--strict-markers --strict-config --durations=5"
+# Registering custom markers.
+# https://docs.pytest.org/en/7.1.x/example/markers.html#registering-markers
+markers = [
+  "requires: mark tests as requiring a specific library",
+  "asyncio: mark tests as requiring asyncio",
+  "compile: mark placeholder test used to compile integration tests without running them",
+]
+asyncio_mode = "auto"
--- a/libs/partners/voyageai/scripts/check_imports.py
+++ b/libs/partners/voyageai/scripts/check_imports.py
@ -0,0 +1,17 @@
+import sys
+import traceback
+from importlib.machinery import SourceFileLoader
+
+def main(files: list) -> int:
+    """Try to import every file in *files* and return a process exit code.
+
+    Each path is loaded as a module. For any file that raises, its path and
+    traceback are printed. Returns 1 if at least one file failed to load,
+    otherwise 0.
+    """
+    has_failure = False
+    for file in files:
+        try:
+            SourceFileLoader("x", file).load_module()
+        except Exception:
+            # Keep going so every broken file is reported in one run.
+            has_failure = True
+            print(file)
+            traceback.print_exc()
+            print()
+    return 1 if has_failure else 0
+
+
+if __name__ == "__main__":
+    sys.exit(main(sys.argv[1:]))
--- a/libs/partners/voyageai/scripts/check_pydantic.sh
+++ b/libs/partners/voyageai/scripts/check_pydantic.sh
@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# This script searches for lines starting with "import pydantic" or "from pydantic"
+# in tracked files within a Git repository.
+#
+# Usage: ./scripts/check_pydantic.sh /path/to/repository
+
+# Require exactly one argument: the repository to scan.
+if [ "$#" -ne 1 ]; then
+  echo "Usage: $0 /path/to/repository"
+  exit 1
+fi
+
+repo_dir="$1"
+
+# Collect every line in the repository that imports pydantic directly.
+matches=$(git -C "$repo_dir" grep -E '^import pydantic|^from pydantic')
+
+# Nothing found: there is nothing to report, so finish successfully.
+if [ -z "$matches" ]; then
+  exit 0
+fi
+
+# Otherwise list the offending lines and explain how to fix them.
+echo "ERROR: The following lines need to be updated:"
+echo "$matches"
+echo "Please replace the code with an import from langchain_core.pydantic_v1."
+echo "For example, replace 'from pydantic import BaseModel'"
+echo "with 'from langchain_core.pydantic_v1 import BaseModel'"
+exit 1
--- a/libs/partners/voyageai/scripts/lint_imports.sh
+++ b/libs/partners/voyageai/scripts/lint_imports.sh
@ -0,0 +1,17 @@
+#!/bin/bash
+
+set -eu
+
+# Number of forbidden import patterns that matched at least once.
+errors=0
+
+# The package must not import from langchain or langchain_experimental.
+# git grep prints the matching lines itself and succeeds only when it finds some.
+if git --no-pager grep '^from langchain\.' .; then
+    errors=$((errors+1))
+fi
+if git --no-pager grep '^from langchain_experimental\.' .; then
+    errors=$((errors+1))
+fi
+
+# Fail when any forbidden import was found.
+if [ "$errors" -gt 0 ]; then
+    exit 1
+fi
+exit 0
--- a/libs/partners/voyageai/tests/__init__.py
+++ b/libs/partners/voyageai/tests/__init__.py
--- a/libs/partners/voyageai/tests/integration_tests/__init__.py
+++ b/libs/partners/voyageai/tests/integration_tests/__init__.py
--- a/libs/partners/voyageai/tests/integration_tests/test_compile.py
+++ b/libs/partners/voyageai/tests/integration_tests/test_compile.py
@ -0,0 +1,7 @@
+import pytest
+
+
+@pytest.mark.compile
+def test_placeholder() -> None:
+    """Do nothing; lets the integration test suite be compiled without real tests."""
--- a/libs/partners/voyageai/tests/integration_tests/test_embeddings.py
+++ b/libs/partners/voyageai/tests/integration_tests/test_embeddings.py
@ -0,0 +1,53 @@
+"""Test VoyageAI embeddings."""
+
+from langchain_voyageai import VoyageAIEmbeddings
+
+# Please set VOYAGE_API_KEY in the environment variables
+MODEL = "voyage-2"
+
+
+def test_langchain_voyageai_embedding_documents() -> None:
+    """Embed a single document and check the result count and vector length."""
+    embedder = VoyageAIEmbeddings(model=MODEL)
+    vectors = embedder.embed_documents(["foo bar"])
+    assert len(vectors) == 1
+    assert len(vectors[0]) == 1024
+
+
+def test_langchain_voyageai_embedding_documents_multiple() -> None:
+    """Embed three documents with ``batch_size=2`` and check every vector."""
+    texts = ["foo bar", "bar foo", "foo"]
+    embedder = VoyageAIEmbeddings(model=MODEL, batch_size=2)
+    vectors = embedder.embed_documents(texts)
+    assert len(vectors) == 3
+    for vector in vectors:
+        assert len(vector) == 1024
+
+
+def test_langchain_voyageai_embedding_query() -> None:
+    """Embed a single query string and check the vector length."""
+    embedder = VoyageAIEmbeddings(model=MODEL)
+    vector = embedder.embed_query("foo bar")
+    assert len(vector) == 1024
+
+
+async def test_langchain_voyageai_async_embedding_documents_multiple() -> None:
+    """Async: embed three documents with ``batch_size=2`` and check every vector."""
+    texts = ["foo bar", "bar foo", "foo"]
+    embedder = VoyageAIEmbeddings(model=MODEL, batch_size=2)
+    vectors = await embedder.aembed_documents(texts)
+    assert len(vectors) == 3
+    for vector in vectors:
+        assert len(vector) == 1024
+
+
+async def test_langchain_voyageai_async_embedding_query() -> None:
+    """Async: embed a single query string and check the vector length."""
+    embedder = VoyageAIEmbeddings(model=MODEL)
+    vector = await embedder.aembed_query("foo bar")
+    assert len(vector) == 1024
--- a/libs/partners/voyageai/tests/unit_tests/__init__.py
+++ b/libs/partners/voyageai/tests/unit_tests/__init__.py
--- a/libs/partners/voyageai/tests/unit_tests/test_embeddings.py
+++ b/libs/partners/voyageai/tests/unit_tests/test_embeddings.py
@ -0,0 +1,36 @@
+"""Test embedding model integration."""
+
+from langchain_core.embeddings import Embeddings
+
+from langchain_voyageai import VoyageAIEmbeddings
+
+MODEL = "voyage-2"
+
+
+def test_initialization_voyage_2() -> None:
+    """A ``voyage-2`` model built without ``batch_size`` gets the default of 72."""
+    embedder = VoyageAIEmbeddings(model=MODEL, voyage_api_key="NOT_A_VALID_KEY")
+    assert isinstance(embedder, Embeddings)
+    assert embedder.batch_size == 72
+    assert embedder.model == MODEL
+    assert embedder._client is not None
+
+
+def test_initialization_voyage_1() -> None:
+    """A ``voyage-01`` model built without ``batch_size`` gets the default of 7."""
+    model_name = "voyage-01"
+    embedder = VoyageAIEmbeddings(model=model_name, voyage_api_key="NOT_A_VALID_KEY")
+    assert isinstance(embedder, Embeddings)
+    assert embedder.batch_size == 7
+    assert embedder.model == "voyage-01"
+    assert embedder._client is not None
+
+
+def test_initialization_voyage_1_batch_size() -> None:
+    """An explicit ``batch_size`` is kept instead of the model default."""
+    init_kwargs = {
+        "voyage_api_key": "NOT_A_VALID_KEY",
+        "model": "voyage-01",
+        "batch_size": 15,
+    }
+    embedder = VoyageAIEmbeddings(**init_kwargs)
+    assert isinstance(embedder, Embeddings)
+    assert embedder.batch_size == 15
+    assert embedder.model == "voyage-01"
+    assert embedder._client is not None
--- a/libs/partners/voyageai/tests/unit_tests/test_imports.py
+++ b/libs/partners/voyageai/tests/unit_tests/test_imports.py
@ -0,0 +1,9 @@
+from langchain_voyageai import __all__
+
+EXPECTED_ALL = [
+    "VoyageAIEmbeddings",
+]
+
+
+def test_all_imports() -> None:
+    """The package's ``__all__`` matches the expected public names, in any order."""
+    expected = sorted(EXPECTED_ALL)
+    exported = sorted(__all__)
+    assert expected == exported