mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
11fda490ca
**Description:** VertexAIEmbeddings performance improvements
**Twitter handle:** @vladkol

## Improvements

- Dynamic batch size, starting from 250 and lowering down to 5. Batch size varies across regions: some regions support larger batches, which significantly improves performance. When running large batches of texts in `us-central1`, the performance gain can be up to 3.5x. Dynamic batching also ensures every batch stays below the 20K-token limit.
- New model parameter `embeddings_type` that translates to the `task_type` parameter of the API. Newer model versions support [different embeddings task types](https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings#api_changes_to_models_released_on_or_after_august_2023).
64 lines
1.5 KiB
Python
"""Test Vertex AI embeddings API wrapper."""
from langchain_community.embeddings import VertexAIEmbeddings
def test_split_by_punctuation() -> None:
    """Words, whitespace, and punctuation must come back as separate parts.

    The expected list is grouped one row per line of the input so the
    round-trip (join of parts == input) is easy to eyeball.
    """
    sample = "Hello, my friend!\nHow are you?\nI have 2 news:\n\n\t- Good,\n\t- Bad."
    parts = VertexAIEmbeddings._split_by_punctuation(sample)
    expected = [
        "Hello", ",", " ", "my", " ", "friend", "!", "\n",
        "How", " ", "are", " ", "you", "?", "\n",
        "I", " ", "have", " ", "2", " ", "news", ":", "\n",
        "\n",
        "\t", "-", " ", "Good", ",", "\n",
        "\t", "-", " ", "Bad", ".",
    ]
    assert parts == expected
def test_batching() -> None:
    """`_prepare_batches` must honor both the requested batch size and the
    ~20K-token-per-batch cap, spilling leftovers into a final short batch."""
    long_text = "foo " * 500  # 500 words; roughly 2000 tokens per document
    long_texts = [long_text] * 250
    documents251 = ["foo bar"] * 251

    # Explicit small batch size: divides 250 docs evenly.
    small_batches = VertexAIEmbeddings._prepare_batches(long_texts, 5)
    assert len(small_batches) == 50  # 250 / 5
    assert len(small_batches[0]) == 5

    # Default max size of 250 gets shrunk so a batch stays under 20K tokens.
    capped_batches = VertexAIEmbeddings._prepare_batches(long_texts, 250)
    assert len(capped_batches[0]) == 10
    assert len(capped_batches) == 25

    # 251 tiny docs: first batch fills to 250, remainder spills into batch two.
    overflow_batches = VertexAIEmbeddings._prepare_batches(documents251, 250)
    assert len(overflow_batches[0]) == 250
    assert len(overflow_batches[1]) == 1