mirror of https://github.com/hwchase17/langchain
parent document retriever (#8941)
parent
a2681f950d
commit
7de6a1b78e
@ -0,0 +1,139 @@
|
||||
import uuid
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from langchain.callbacks.base import Callbacks
|
||||
from langchain.schema.document import Document
|
||||
from langchain.schema.retriever import BaseRetriever
|
||||
from langchain.schema.storage import BaseStore
|
||||
from langchain.text_splitter import TextSplitter
|
||||
from langchain.vectorstores.base import VectorStore
|
||||
|
||||
|
||||
class ParentDocumentRetriever(BaseRetriever):
    """Fetches small chunks, then fetches their parent documents.

    When splitting documents for retrieval, there are often conflicting desires:

    1. You may want to have small documents, so that their embeddings can most
        accurately reflect their meaning. If too long, then the embeddings can
        lose meaning.
    2. You want to have long enough documents that the context of each chunk is
        retained.

    The ParentDocumentRetriever strikes that balance by splitting and storing
    small chunks of data. During retrieval, it first fetches the small chunks
    but then looks up the parent ids for those chunks and returns those larger
    documents.

    Note that "parent document" refers to the document that a small chunk
    originated from. This can either be the whole raw document OR a larger
    chunk.

    Examples:

        .. code-block:: python

            # Imports
            from langchain.vectorstores import Chroma
            from langchain.embeddings import OpenAIEmbeddings
            from langchain.text_splitter import RecursiveCharacterTextSplitter
            from langchain.storage import InMemoryStore

            # This text splitter is used to create the parent documents
            parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
            # This text splitter is used to create the child documents
            # It should create documents smaller than the parent
            child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
            # The vectorstore to use to index the child chunks
            vectorstore = Chroma(embedding_function=OpenAIEmbeddings())
            # The storage layer for the parent documents
            store = InMemoryStore()

            # Initialize the retriever
            retriever = ParentDocumentRetriever(
                vectorstore=vectorstore,
                docstore=store,
                child_splitter=child_splitter,
                parent_splitter=parent_splitter,
            )
    """

    vectorstore: VectorStore
    """The underlying vectorstore to use to store small chunks
    and their embedding vectors"""
    docstore: BaseStore[str, Document]
    """The storage layer for the parent documents"""
    child_splitter: TextSplitter
    """The text splitter to use to create child documents."""
    id_key: str = "doc_id"
    """The key to use to track the parent id. This will be stored in the
    metadata of child documents."""
    parent_splitter: Optional[TextSplitter] = None
    """The text splitter to use to create parent documents.
    If none, then the parent documents will be the raw documents passed in."""

    def get_relevant_documents(
        self,
        query: str,
        *,
        callbacks: Callbacks = None,
        tags: Optional[List[str]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ) -> List[Document]:
        """Return parent documents whose child chunks are relevant to ``query``.

        Args:
            query: String to find relevant documents for.
            callbacks: Callback manager or list of callbacks (unused here;
                part of the ``BaseRetriever`` interface).
            tags: Optional list of tags for the retriever run (unused here).
            metadata: Optional metadata for the retriever run (unused here).
            **kwargs: Additional arguments (unused here).

        Returns:
            The deduplicated parent documents, in the order their child
            chunks were returned by the vectorstore. Parents missing from
            the docstore are silently skipped.
        """
        sub_docs = self.vectorstore.similarity_search(query)
        # Deduplicate parent ids while preserving the order in which their
        # child chunks were returned by the vectorstore.
        ids = []
        for d in sub_docs:
            if d.metadata[self.id_key] not in ids:
                ids.append(d.metadata[self.id_key])
        docs = self.docstore.mget(ids)
        # mget returns None for keys not present in the docstore; drop those.
        return [d for d in docs if d is not None]

    def add_documents(
        self,
        documents: List[Document],
        ids: Optional[List[str]] = None,
        add_to_docstore: bool = True,
    ) -> None:
        """Adds documents to the docstore and vectorstores.

        Args:
            documents: List of documents to add
            ids: Optional list of ids for documents. If provided should be the
                same length as the list of documents (after parent splitting,
                when a ``parent_splitter`` is configured). Can be provided if
                parent documents are already in the document store and you
                don't want to re-add to the docstore. If not provided, random
                UUIDs will be used as ids.
            add_to_docstore: Boolean of whether to add documents to docstore.
                This can be false if and only if `ids` are provided. You may
                want to set this to False if the documents are already in the
                docstore and you don't want to re-add them.

        Raises:
            ValueError: If ``ids`` is None while ``add_to_docstore`` is False,
                or if ``ids`` and ``documents`` have different lengths.
        """
        if self.parent_splitter is not None:
            documents = self.parent_splitter.split_documents(documents)
        if ids is None:
            doc_ids = [str(uuid.uuid4()) for _ in documents]
            # Without externally supplied ids the parents must be stored here,
            # otherwise the generated ids would point at nothing.
            if not add_to_docstore:
                raise ValueError(
                    "If ids are not passed in, `add_to_docstore` MUST be True"
                )
        else:
            # Note: compared against the post-split document count when a
            # parent_splitter is set.
            if len(documents) != len(ids):
                raise ValueError(
                    "Got uneven list of documents and ids. "
                    "If `ids` is provided, should be same length as `documents`."
                )
            doc_ids = ids

        docs = []
        full_docs = []
        for i, doc in enumerate(documents):
            _id = doc_ids[i]
            sub_docs = self.child_splitter.split_documents([doc])
            # Tag every child chunk with its parent's id so retrieval can map
            # a matching chunk back to the full parent document.
            for _doc in sub_docs:
                _doc.metadata[self.id_key] = _id
            docs.extend(sub_docs)
            full_docs.append((_id, doc))
        self.vectorstore.add_documents(docs)
        if add_to_docstore:
            self.docstore.mset(full_docs)
|
Loading…
Reference in New Issue