You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
langchain/libs/community/tests/integration_tests/retrievers/docarray/fixtures.py

234 lines
6.4 KiB
Python

from __future__ import annotations
from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, Generator, Tuple
import numpy as np
import pytest
from langchain_core.pydantic_v1 import Field
if TYPE_CHECKING:
from docarray.index import (
ElasticDocIndex,
HnswDocumentIndex,
InMemoryExactNNIndex,
QdrantDocumentIndex,
WeaviateDocumentIndex,
)
from docarray.typing import NdArray
from qdrant_client.http import models as rest
from langchain_community.embeddings import FakeEmbeddings
@pytest.fixture
def init_weaviate() -> (
    Generator[
        Tuple[WeaviateDocumentIndex, Dict[str, Any], FakeEmbeddings],
        None,
        None,
    ]
):
    """Yield a Weaviate document index pre-populated with 100 fake documents.

    Expects a Weaviate server at http://localhost:8080. To start one:

        cd tests/integration_tests/vectorstores/docker-compose
        docker compose -f weaviate.yml up

    Yields:
        A ``(index, filter_query, embeddings)`` tuple. ``filter_query`` is a
        dict using the ``LessThanEqual`` operator on ``year`` with value 90.
    """
    from docarray import BaseDoc
    from docarray.index import (
        WeaviateDocumentIndex,
    )

    class WeaviateDoc(BaseDoc):
        # When initializing the Weaviate index, denote the field
        # you want to search on with `is_embedding=True`
        title: str
        title_embedding: NdArray[32] = Field(is_embedding=True)  # type: ignore
        other_emb: NdArray[32]  # type: ignore
        year: int

    # Embedding size matches the NdArray[32] fields declared on WeaviateDoc.
    embeddings = FakeEmbeddings(size=32)

    # initialize WeaviateDocumentIndex
    dbconfig = WeaviateDocumentIndex.DBConfig(host="http://localhost:8080")
    weaviate_db = WeaviateDocumentIndex[WeaviateDoc](
        db_config=dbconfig, index_name="docarray_retriever"
    )

    try:
        # index data: one document per year 0..99
        weaviate_db.index(
            [
                WeaviateDoc(
                    title=f"My document {i}",
                    title_embedding=np.array(embeddings.embed_query(f"fake emb {i}")),
                    other_emb=np.array(embeddings.embed_query(f"other fake emb {i}")),
                    year=i,
                )
                for i in range(100)
            ]
        )
        # build a filter query
        filter_query = {"path": ["year"], "operator": "LessThanEqual", "valueInt": "90"}

        yield weaviate_db, filter_query, embeddings
    finally:
        # Run the cleanup even when indexing fails before the yield, so a
        # broken setup does not leave data behind in the shared server.
        weaviate_db._client.schema.delete_all()
@pytest.fixture
def init_elastic() -> (
    Generator[Tuple[ElasticDocIndex, Dict[str, Any], FakeEmbeddings], None, None]
):
    """Yield an Elasticsearch document index pre-populated with 100 fake documents.

    Expects an Elasticsearch server at http://localhost:9200. To start one:

        cd tests/integration_tests/vectorstores/docker-compose
        docker-compose -f elasticsearch.yml up

    Yields:
        A ``(index, filter_query, embeddings)`` tuple. ``filter_query`` is a
        ``range`` query dict on ``year`` with ``lte`` 90.
    """
    from docarray import BaseDoc
    from docarray.index import (
        ElasticDocIndex,
    )

    class MyDoc(BaseDoc):
        title: str
        title_embedding: NdArray[32]  # type: ignore
        other_emb: NdArray[32]  # type: ignore
        year: int

    # Embedding size matches the NdArray[32] fields declared on MyDoc.
    embeddings = FakeEmbeddings(size=32)

    # initialize ElasticDocIndex
    elastic_db = ElasticDocIndex[MyDoc](
        hosts="http://localhost:9200", index_name="docarray_retriever"
    )

    try:
        # index data: one document per year 0..99
        elastic_db.index(
            [
                MyDoc(
                    title=f"My document {i}",
                    title_embedding=np.array(embeddings.embed_query(f"fake emb {i}")),
                    other_emb=np.array(embeddings.embed_query(f"other fake emb {i}")),
                    year=i,
                )
                for i in range(100)
            ]
        )
        # build a filter query
        filter_query = {"range": {"year": {"lte": 90}}}

        yield elastic_db, filter_query, embeddings
    finally:
        # Delete the index even when indexing fails before the yield, so a
        # broken setup does not leave "docarray_retriever" behind.
        elastic_db._client.indices.delete(index="docarray_retriever")
@pytest.fixture
def init_qdrant() -> Tuple[QdrantDocumentIndex, rest.Filter, FakeEmbeddings]:
    """Build an in-memory Qdrant document index holding 100 fake documents.

    Returns:
        A ``(index, filter_query, embeddings)`` tuple. ``filter_query`` is a
        ``rest.Filter`` whose single ``must`` condition is a range on ``year``
        with ``gte=10`` and ``lt=90``.
    """
    from docarray import BaseDoc
    from docarray.index import QdrantDocumentIndex

    # `rest` is called at runtime below, so import it here rather than rely
    # on a module-level import that may exist only for type checking.
    from qdrant_client.http import models as rest

    class MyDoc(BaseDoc):
        title: str
        title_embedding: NdArray[32]  # type: ignore
        other_emb: NdArray[32]  # type: ignore
        year: int

    # Embedding size matches the NdArray[32] fields declared on MyDoc.
    embeddings = FakeEmbeddings(size=32)

    # initialize QdrantDocumentIndex
    qdrant_config = QdrantDocumentIndex.DBConfig(path=":memory:")
    qdrant_db = QdrantDocumentIndex[MyDoc](qdrant_config)

    # index data: one document per year 0..99
    qdrant_db.index(
        [
            MyDoc(
                title=f"My document {i}",
                title_embedding=np.array(embeddings.embed_query(f"fake emb {i}")),
                other_emb=np.array(embeddings.embed_query(f"other fake emb {i}")),
                year=i,
            )
            for i in range(100)
        ]
    )

    # build a filter query
    filter_query = rest.Filter(
        must=[
            rest.FieldCondition(
                key="year",
                range=rest.Range(
                    gte=10,
                    lt=90,
                ),
            )
        ]
    )

    return qdrant_db, filter_query, embeddings
@pytest.fixture
def init_in_memory() -> Tuple[InMemoryExactNNIndex, Dict[str, Any], FakeEmbeddings]:
    """Build an ``InMemoryExactNNIndex`` holding 100 fake documents.

    Returns:
        A ``(index, filter_query, embeddings)`` tuple. ``filter_query`` is the
        dict ``{"year": {"$lte": 90}}``.
    """
    from docarray import BaseDoc
    from docarray.index import InMemoryExactNNIndex

    class MyDoc(BaseDoc):
        title: str
        title_embedding: NdArray[32]  # type: ignore
        other_emb: NdArray[32]  # type: ignore
        year: int

    # Embedding size matches the NdArray[32] fields declared on MyDoc.
    fake_embedder = FakeEmbeddings(size=32)

    # Create the empty index before building the documents.
    index = InMemoryExactNNIndex[MyDoc]()

    # One document per year 0..99, each carrying two embedding vectors.
    documents = []
    for i in range(100):
        doc_title = f"My document {i}"
        title_vector = np.array(fake_embedder.embed_query(f"fake emb {i}"))
        other_vector = np.array(fake_embedder.embed_query(f"other fake emb {i}"))
        documents.append(
            MyDoc(
                title=doc_title,
                title_embedding=title_vector,
                other_emb=other_vector,
                year=i,
            )
        )
    index.index(documents)

    # Filter expressing "year <= 90".
    year_filter = {"year": {"$lte": 90}}
    return index, year_filter, fake_embedder
@pytest.fixture
def init_hnsw(
    tmp_path: Path,
) -> Tuple[HnswDocumentIndex, Dict[str, Any], FakeEmbeddings]:
    """Build an ``HnswDocumentIndex`` under ``tmp_path`` holding 100 fake documents.

    Args:
        tmp_path: Directory passed to the index as its ``work_dir``.

    Returns:
        A ``(index, filter_query, embeddings)`` tuple. ``filter_query`` is the
        dict ``{"year": {"$lte": 90}}``.
    """
    from docarray import BaseDoc
    from docarray.index import (
        HnswDocumentIndex,
    )

    class MyDoc(BaseDoc):
        title: str
        title_embedding: NdArray[32]  # type: ignore
        other_emb: NdArray[32]  # type: ignore
        year: int

    # Embedding size matches the NdArray[32] fields declared on MyDoc.
    fake_embedder = FakeEmbeddings(size=32)

    # Create the HnswDocumentIndex before building the documents.
    index = HnswDocumentIndex[MyDoc](work_dir=tmp_path)

    # One document per year 0..99, each carrying two embedding vectors.
    documents = []
    for i in range(100):
        doc_title = f"My document {i}"
        title_vector = np.array(fake_embedder.embed_query(f"fake emb {i}"))
        other_vector = np.array(fake_embedder.embed_query(f"other fake emb {i}"))
        documents.append(
            MyDoc(
                title=doc_title,
                title_embedding=title_vector,
                other_emb=other_vector,
                year=i,
            )
        )
    index.index(documents)

    # Filter expressing "year <= 90".
    year_filter = {"year": {"$lte": 90}}
    return index, year_filter, fake_embedder