mirror of
https://github.com/hwchase17/langchain
synced 2024-10-29 17:07:25 +00:00
427551eabf
## DocArray as a Retriever

[DocArray](https://github.com/docarray/docarray) is an open-source tool for managing your multi-modal data. It offers flexibility to store and search through your data using various document index backends. This PR introduces `DocArrayRetriever`, which works with any available backend and serves as a retriever for LangChain apps.

I also added 2 notebooks:

- DocArray Backends - an intro to all 5 currently supported backends: how to initialize, index, and use them as a retriever
- DocArray Usage - showcases the additional search parameters you can pass to create versatile retrievers

Example:

```python
from docarray.index import InMemoryExactNNIndex
from docarray import BaseDoc, DocList
from docarray.typing import NdArray
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.retrievers import DocArrayRetriever


# define document schema
class MyDoc(BaseDoc):
    description: str
    description_embedding: NdArray[1536]


embeddings = OpenAIEmbeddings()

# create documents
descriptions = ["description 1", "description 2"]
desc_embeddings = embeddings.embed_documents(texts=descriptions)
docs = DocList[MyDoc](
    [
        MyDoc(description=desc, description_embedding=embedding)
        for desc, embedding in zip(descriptions, desc_embeddings)
    ]
)

# initialize document index with data
db = InMemoryExactNNIndex[MyDoc](docs)

# create a retriever
retriever = DocArrayRetriever(
    index=db,
    embeddings=embeddings,
    search_field="description_embedding",
    content_field="description",
)

# find the relevant document
doc = retriever.get_relevant_documents("action movies")
print(doc)
```

#### Who can review?

@dev2049

---------

Signed-off-by: jupyterjazz <saba.sturua@jina.ai>
72 lines
2.0 KiB
Python
72 lines
2.0 KiB
Python
from typing import Any
|
|
|
|
import pytest
|
|
from vcr.request import Request
|
|
|
|
from langchain.retrievers import DocArrayRetriever
|
|
from tests.integration_tests.retrievers.docarray.fixtures import ( # noqa: F401
|
|
init_elastic,
|
|
init_hnsw,
|
|
init_in_memory,
|
|
init_qdrant,
|
|
init_weaviate,
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
    "backend",
    ["init_hnsw", "init_in_memory", "init_qdrant", "init_elastic", "init_weaviate"],
)
def test_backends(request: Request, backend: Any) -> None:
    """Run the same retrieval checks against each DocArray backend fixture.

    The fixture named by ``backend`` is resolved at runtime and unpacked into
    an index, a filter query and an embeddings object. Three retrievers are
    then built on that index and queried in turn: one with default settings,
    one with ``filters`` set, and one with ``filters`` plus
    ``search_type="mmr"``. Each query must return exactly one document whose
    content contains "My document", whose metadata has "id" and "year" but
    not "other_emb". When filters are used, "year" must also be at most 90.

    Args:
        request: Object used to look up the backend fixture by name.
        backend: Name of the fixture to resolve for this parametrized run.
    """
    # NOTE(review): ``Request`` is imported from ``vcr.request``, yet the only
    # use here is ``getfixturevalue``; this looks like it should be pytest's
    # fixture-request type instead. Confirm before changing the annotation.
    index, filter_query, embeddings = request.getfixturevalue(backend)
    query = "my docs"

    def make_retriever(**extra: Any) -> DocArrayRetriever:
        # Every scenario shares these four arguments; only ``extra`` varies.
        return DocArrayRetriever(
            index=index,
            embeddings=embeddings,
            search_field="title_embedding",
            content_field="title",
            **extra,
        )

    def check_single_hit(hits: Any, *, year_capped: bool) -> None:
        # Exactly one document is expected from every scenario.
        assert len(hits) == 1
        top = hits[0]
        assert "My document" in top.page_content
        assert "id" in top.metadata
        assert "year" in top.metadata
        assert "other_emb" not in top.metadata
        if year_capped:
            # Only asserted for retrievers that were given ``filters``.
            assert top.metadata["year"] <= 90

    # Default retriever, no filters.
    plain_retriever = make_retriever()
    check_single_hit(plain_retriever.get_relevant_documents(query), year_capped=False)

    # Retriever with filters applied.
    filtered_retriever = make_retriever(filters=filter_query)
    check_single_hit(
        filtered_retriever.get_relevant_documents(query), year_capped=True
    )

    # Retriever using MMR search together with the same filters.
    mmr_retriever = make_retriever(search_type="mmr", filters=filter_query)
    check_single_hit(mmr_retriever.get_relevant_documents(query), year_capped=True)
|