multivector notebook (#9740)

This commit is contained in:
Harrison Chase 2023-08-25 07:07:27 -07:00 committed by GitHub
parent 9731ce5a40
commit 709a67d9bf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -13,7 +13,10 @@
"\n",
"- smaller chunks: split a document into smaller chunks, and embed those (this is ParentDocumentRetriever)\n",
"- summary: create a summary for each document, embed that along with (or instead of) the document\n",
"- hypothetical questions: create hypothetical questions that each document would be appropriate to answer, embed those along with (or instead of) the document"
"- hypothetical questions: create hypothetical questions that each document would be appropriate to answer, embed those along with (or instead of) the document\n",
"\n",
"\n",
"Note that this also enables another method of adding embeddings - manually. This is great because you can explicitly add questions or queries that should lead to a document being recovered, giving you more control"
]
},
{
@ -106,7 +109,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "5d23247d",
"metadata": {},
"outputs": [],
@ -122,7 +125,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "92ed5861",
"metadata": {},
"outputs": [],
@ -133,17 +136,17 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "8afed60c",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': 'b4ca7817-e3fe-4103-ac81-574fb41439ef', 'source': '../../state_of_the_union.txt'})"
"Document(page_content='Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court.', metadata={'doc_id': '10e9cbc0-4ba5-4d79-a09b-c033d1ba7b01', 'source': '../../state_of_the_union.txt'})"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -155,7 +158,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "3c9017f1",
"metadata": {},
"outputs": [
@ -165,7 +168,7 @@
"9874"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -187,7 +190,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 10,
"id": "1433dff4",
"metadata": {},
"outputs": [],
@ -201,7 +204,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"id": "35b30390",
"metadata": {},
"outputs": [],
@ -216,17 +219,17 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 12,
"id": "41a2a738",
"metadata": {},
"outputs": [],
"source": [
"summaries = [chain.invoke(d) for d in docs]"
"summaries = chain.batch(docs, {\"max_concurrency\": 5})"
]
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 13,
"id": "7ac5e4b1",
"metadata": {},
"outputs": [],
@ -250,7 +253,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 15,
"id": "0d93309f",
"metadata": {},
"outputs": [],
@ -260,7 +263,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 16,
"id": "6d5edf0d",
"metadata": {},
"outputs": [],
@ -271,7 +274,20 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 17,
"id": "862ae920",
"metadata": {},
"outputs": [],
"source": [
"# # We can also add the original chunks to the vectorstore if we so want\n",
"# for i, doc in enumerate(docs):\n",
"# doc.metadata[id_key] = doc_ids[i]\n",
"# retriever.vectorstore.add_documents(docs)"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "299232d6",
"metadata": {},
"outputs": [],
@ -281,17 +297,17 @@
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"id": "10e404c0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='The document discusses various topics and proposals put forth by the President in a State of the Union address. These include the nomination of a judge for the Supreme Court, securing the border and fixing the immigration system, advancing liberty and justice for women and LGBTQ+ individuals, passing bipartisan legislation, addressing the opioid epidemic and mental health issues, supporting veterans, and ending cancer. The President expresses optimism about the future of the country and emphasizes the strength of the American people.', metadata={'doc_id': '8c7a707d-615d-42d5-919d-bc5178dd1ae4'})"
"Document(page_content=\"The document is a transcript of a speech given by the President of the United States. The President discusses several important issues and initiatives, including the nomination of a Supreme Court Justice, border security and immigration reform, protecting women's rights, advancing LGBTQ+ equality, bipartisan legislation, addressing the opioid epidemic and mental health, supporting veterans, investigating the health effects of burn pits on military personnel, ending cancer, and the strength and resilience of the American people.\", metadata={'doc_id': '79fa2e9f-28d9-4372-8af3-2caf4f1de312'})"
]
},
"execution_count": 20,
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@ -302,7 +318,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"id": "e4cce5c2",
"metadata": {},
"outputs": [],
@ -312,7 +328,7 @@
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 21,
"id": "c8570dbb",
"metadata": {},
"outputs": [
@ -322,7 +338,7 @@
"9194"
]
},
"execution_count": 24,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@ -340,6 +356,203 @@
"\n",
"An LLM can also be used to generate a list of hypothetical questions that could be asked of a particular document. These questions can then be embedded"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "5219b085",
"metadata": {},
"outputs": [],
"source": [
"functions = [\n",
" {\n",
" \"name\": \"hypothetical_questions\",\n",
" \"description\": \"Generate hypothetical questions\",\n",
" \"parameters\": {\n",
" \"type\": \"object\",\n",
" \"properties\": {\n",
" \"questions\": {\n",
" \"type\": \"array\",\n",
" \"items\": {\n",
" \"type\": \"string\"\n",
" },\n",
" },\n",
" },\n",
" \"required\": [\"questions\"]\n",
" }\n",
" }\n",
" ]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "523deb92",
"metadata": {},
"outputs": [],
"source": [
"from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser\n",
"chain = (\n",
" {\"doc\": lambda x: x.page_content}\n",
" # Only asking for 3 hypothetical questions, but this could be adjusted\n",
" | ChatPromptTemplate.from_template(\"Generate a list of 3 hypothetical questions that the below document could be used to answer:\\n\\n{doc}\")\n",
" | ChatOpenAI(max_retries=0, model=\"gpt-4\").bind(functions=functions, function_call={\"name\": \"hypothetical_questions\"})\n",
" | JsonKeyOutputFunctionsParser(key_name=\"questions\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "11d30554",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[\"What was the author's initial impression of philosophy as a field of study, and how did it change when they got to college?\",\n",
" 'Why did the author decide to switch their focus to Artificial Intelligence (AI)?',\n",
" \"What led to the author's disillusionment with the field of AI as it was practiced at the time?\"]"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.invoke(docs[0])"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "3eb2e48c",
"metadata": {},
"outputs": [],
"source": [
"hypothetical_questions = chain.batch(docs, {\"max_concurrency\": 5})"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "b2cd6e75",
"metadata": {},
"outputs": [],
"source": [
"# The vectorstore to use to index the child chunks\n",
"vectorstore = Chroma(\n",
" collection_name=\"hypo-questions\",\n",
" embedding_function=OpenAIEmbeddings()\n",
")\n",
"# The storage layer for the parent documents\n",
"store = InMemoryStore()\n",
"id_key = \"doc_id\"\n",
"# The retriever (empty to start)\n",
"retriever = MultiVectorRetriever(\n",
" vectorstore=vectorstore, \n",
" docstore=store, \n",
" id_key=id_key,\n",
")\n",
"doc_ids = [str(uuid.uuid4()) for _ in docs]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"id": "18831b3b",
"metadata": {},
"outputs": [],
"source": [
"question_docs = []\n",
"for i, question_list in enumerate(hypothetical_questions):\n",
" question_docs.extend([Document(page_content=s,metadata={id_key: doc_ids[i]}) for s in question_list])"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "224b24c5",
"metadata": {},
"outputs": [],
"source": [
"retriever.vectorstore.add_documents(question_docs)\n",
"retriever.docstore.mset(list(zip(doc_ids, docs)))"
]
},
{
"cell_type": "code",
"execution_count": 70,
"id": "7b442b90",
"metadata": {},
"outputs": [],
"source": [
"sub_docs = vectorstore.similarity_search(\"justice breyer\")"
]
},
{
"cell_type": "code",
"execution_count": 71,
"id": "089b5ad0",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '505d73e3-8350-46ec-a58e-3af032f04ab3'}),\n",
" Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '1c9618f0-7660-4b4f-a37c-509cbbbf6dba'}),\n",
" Document(page_content=\"What is the President's stance on immigration reform?\", metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'}),\n",
" Document(page_content='What measures is the President proposing to protect the rights of LGBTQ+ Americans?', metadata={'doc_id': '82c08209-b904-46a8-9532-edd2380950b7'})]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sub_docs"
]
},
{
"cell_type": "code",
"execution_count": 72,
"id": "7594b24e",
"metadata": {},
"outputs": [],
"source": [
"retrieved_docs = retriever.get_relevant_documents(\"justice breyer\")"
]
},
{
"cell_type": "code",
"execution_count": 73,
"id": "4c120c65",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"9194"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(retrieved_docs[0].page_content)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "616cfeeb",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {