mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
427551eabf
## DocArray as a Retriever [DocArray](https://github.com/docarray/docarray) is an open-source tool for managing your multi-modal data. It offers flexibility to store and search through your data using various document index backends. This PR introduces `DocArrayRetriever` - which works with any available backend and serves as a retriever for Langchain apps. Also, I added 2 notebooks: DocArray Backends - intro to all 5 currently supported backends, how to initialize, index, and use them as a retriever DocArray Usage - showcasing what additional search parameters you can pass to create versatile retrievers Example: ```python from docarray.index import InMemoryExactNNIndex from docarray import BaseDoc, DocList from docarray.typing import NdArray from langchain.embeddings.openai import OpenAIEmbeddings from langchain.retrievers import DocArrayRetriever # define document schema class MyDoc(BaseDoc): description: str description_embedding: NdArray[1536] embeddings = OpenAIEmbeddings() # create documents descriptions = ["description 1", "description 2"] desc_embeddings = embeddings.embed_documents(texts=descriptions) docs = DocList[MyDoc]( [ MyDoc(description=desc, description_embedding=embedding) for desc, embedding in zip(descriptions, desc_embeddings) ] ) # initialize document index with data db = InMemoryExactNNIndex[MyDoc](docs) # create a retriever retriever = DocArrayRetriever( index=db, embeddings=embeddings, search_field="description_embedding", content_field="description", ) # find the relevant document doc = retriever.get_relevant_documents("action movies") print(doc) ``` #### Who can review? @dev2049 --------- Signed-off-by: jupyterjazz <saba.sturua@jina.ai>
196 lines
5.4 KiB
Python
196 lines
5.4 KiB
Python
from pathlib import Path
|
|
from typing import Any, Dict, Generator, Tuple
|
|
|
|
import numpy as np
|
|
import pytest
|
|
from docarray import BaseDoc
|
|
from docarray.index import (
|
|
ElasticDocIndex,
|
|
HnswDocumentIndex,
|
|
InMemoryExactNNIndex,
|
|
QdrantDocumentIndex,
|
|
WeaviateDocumentIndex,
|
|
)
|
|
from docarray.typing import NdArray
|
|
from pydantic import Field
|
|
from qdrant_client.http import models as rest
|
|
|
|
from langchain.embeddings import FakeEmbeddings
|
|
|
|
|
|
class MyDoc(BaseDoc):
    """Document schema used by the Elastic/Qdrant/in-memory/HNSW fixtures.

    ``title_embedding`` is the 32-dim vector field the retriever searches
    (matches ``FakeEmbeddings(size=32)`` used in the fixtures below);
    ``other_emb`` is a second vector field, present so tests can verify
    that the correct search field is selected.
    """

    title: str
    title_embedding: NdArray[32]  # type: ignore
    other_emb: NdArray[32]  # type: ignore
    year: int
|
|
|
|
|
|
class WeaviateDoc(BaseDoc):
    """Document schema for the Weaviate fixture.

    Weaviate requires the searchable vector field to be marked with
    ``is_embedding=True`` at schema-definition time, hence the separate
    class from ``MyDoc``.
    """

    # When initializing the Weaviate index, denote the field
    # you want to search on with `is_embedding=True`
    title: str
    title_embedding: NdArray[32] = Field(is_embedding=True)  # type: ignore
    other_emb: NdArray[32]  # type: ignore
    year: int
|
|
|
|
|
|
@pytest.fixture
def init_weaviate() -> (
    Generator[
        Tuple[WeaviateDocumentIndex[WeaviateDoc], Dict[str, Any], FakeEmbeddings],
        None,
        None,
    ]
):
    """Yield a populated Weaviate index, a sample filter, and the embeddings.

    Requires a running Weaviate instance:

        cd tests/integration_tests/vectorstores/docker-compose
        docker compose -f weaviate.yml up
    """
    embeddings = FakeEmbeddings(size=32)

    # initialize WeaviateDocumentIndex against the local docker instance
    dbconfig = WeaviateDocumentIndex.DBConfig(host="http://localhost:8080")
    weaviate_db = WeaviateDocumentIndex[WeaviateDoc](
        db_config=dbconfig, index_name="docarray_retriever"
    )

    # index 100 documents with deterministic fake embeddings
    weaviate_db.index(
        [
            WeaviateDoc(
                title=f"My document {i}",
                title_embedding=np.array(embeddings.embed_query(f"fake emb {i}")),
                other_emb=np.array(embeddings.embed_query(f"other fake emb {i}")),
                year=i,
            )
            for i in range(100)
        ]
    )
    # build a filter query matching year <= 90; `valueInt` takes an integer
    # operand per the Weaviate where-filter spec (was previously the
    # string "90")
    filter_query = {"path": ["year"], "operator": "LessThanEqual", "valueInt": 90}

    yield weaviate_db, filter_query, embeddings

    # teardown: wipe the schema so later tests start from a clean instance
    weaviate_db._client.schema.delete_all()
|
|
|
|
|
|
@pytest.fixture
def init_elastic() -> (
    Generator[Tuple[ElasticDocIndex[MyDoc], Dict[str, Any], FakeEmbeddings], None, None]
):
    """Yield a populated Elasticsearch index, a sample filter, and the embeddings.

    Requires a running Elasticsearch instance:

        cd tests/integration_tests/vectorstores/docker-compose
        docker-compose -f elasticsearch.yml up
    """
    embeddings = FakeEmbeddings(size=32)

    # connect to the local Elasticsearch instance
    elastic_db = ElasticDocIndex[MyDoc](
        hosts="http://localhost:9200", index_name="docarray_retriever"
    )

    # build 100 documents with deterministic fake embeddings, then index them
    documents = [
        MyDoc(
            title=f"My document {num}",
            title_embedding=np.array(embeddings.embed_query(f"fake emb {num}")),
            other_emb=np.array(embeddings.embed_query(f"other fake emb {num}")),
            year=num,
        )
        for num in range(100)
    ]
    elastic_db.index(documents)

    # filter selecting documents whose year is at most 90
    filter_query = {"range": {"year": {"lte": 90}}}

    yield elastic_db, filter_query, embeddings

    # teardown: drop the test index
    elastic_db._client.indices.delete(index="docarray_retriever")
|
|
|
|
|
|
@pytest.fixture
def init_qdrant() -> Tuple[QdrantDocumentIndex[MyDoc], rest.Filter, FakeEmbeddings]:
    """Return a populated in-memory Qdrant index, a sample filter, and the embeddings."""
    embeddings = FakeEmbeddings(size=32)

    # an in-memory Qdrant index needs no external service
    config = QdrantDocumentIndex.DBConfig(path=":memory:")
    qdrant_db = QdrantDocumentIndex[MyDoc](config)

    # build 100 documents with deterministic fake embeddings, then index them
    documents = [
        MyDoc(
            title=f"My document {num}",
            title_embedding=np.array(embeddings.embed_query(f"fake emb {num}")),
            other_emb=np.array(embeddings.embed_query(f"other fake emb {num}")),
            year=num,
        )
        for num in range(100)
    ]
    qdrant_db.index(documents)

    # filter selecting documents with 10 <= year < 90
    year_range = rest.Range(gte=10, lt=90)
    filter_query = rest.Filter(
        must=[rest.FieldCondition(key="year", range=year_range)]
    )

    return qdrant_db, filter_query, embeddings
|
|
|
|
|
|
@pytest.fixture
def init_in_memory() -> (
    Tuple[InMemoryExactNNIndex[MyDoc], Dict[str, Any], FakeEmbeddings]
):
    """Return a populated InMemoryExactNNIndex, a sample filter, and the embeddings."""
    embeddings = FakeEmbeddings(size=32)

    # an exact-NN index held entirely in memory; no external service needed
    in_memory_db = InMemoryExactNNIndex[MyDoc]()

    # build 100 documents with deterministic fake embeddings, then index them
    documents = [
        MyDoc(
            title=f"My document {num}",
            title_embedding=np.array(embeddings.embed_query(f"fake emb {num}")),
            other_emb=np.array(embeddings.embed_query(f"other fake emb {num}")),
            year=num,
        )
        for num in range(100)
    ]
    in_memory_db.index(documents)

    # filter selecting documents whose year is at most 90
    filter_query = {"year": {"$lte": 90}}

    return in_memory_db, filter_query, embeddings
|
|
|
|
|
|
@pytest.fixture
def init_hnsw(
    tmp_path: Path,
) -> Tuple[HnswDocumentIndex[MyDoc], Dict[str, Any], FakeEmbeddings]:
    """Return a populated on-disk HNSW index, a sample filter, and the embeddings.

    The index is persisted under pytest's per-test temporary directory, so
    no external service is required.
    """
    embeddings = FakeEmbeddings(size=32)

    # initialize HnswDocumentIndex (previous comment wrongly said
    # InMemoryExactNNIndex); pass work_dir as str since tmp_path is a Path
    hnsw_db = HnswDocumentIndex[MyDoc](work_dir=str(tmp_path))

    # index 100 documents with deterministic fake embeddings
    hnsw_db.index(
        [
            MyDoc(
                title=f"My document {i}",
                title_embedding=np.array(embeddings.embed_query(f"fake emb {i}")),
                other_emb=np.array(embeddings.embed_query(f"other fake emb {i}")),
                year=i,
            )
            for i in range(100)
        ]
    )
    # build a filter query matching year <= 90
    filter_query = {"year": {"$lte": 90}}

    return hnsw_db, filter_query, embeddings
|