From d19f269a34105953bb37c0cca742bc099169cc43 Mon Sep 17 00:00:00 2001 From: Harrison Chase Date: Mon, 27 Feb 2023 07:45:54 -0800 Subject: [PATCH] bump version to 0.0.95 (#1324) --- docs/modules/indexes/getting_started.ipynb | 61 ++++++++++++++++++++-- docs/use_cases/question_answering.md | 40 +++++++++++++- langchain/indexes/vectorstore.py | 38 ++++++++++++-- pyproject.toml | 2 +- 4 files changed, 131 insertions(+), 10 deletions(-) diff --git a/docs/modules/indexes/getting_started.ipynb b/docs/modules/indexes/getting_started.ipynb index 4a2678ee..d06178ee 100644 --- a/docs/modules/indexes/getting_started.ipynb +++ b/docs/modules/indexes/getting_started.ipynb @@ -95,7 +95,7 @@ "id": "f3493fa4", "metadata": {}, "source": [ - "Now that the index is created, we can use it in a VectorDBQAChain to ask questions of the data!" + "Now that the index is created, we can use it to ask questions of the data! Note that under the hood this is actually doing a few steps as well, which we will cover later in this guide." ] }, { @@ -107,7 +107,7 @@ { "data": { "text/plain": [ - "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a consensus builder, and has gained a broad range of support. He also said that she is a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers.\"" + "\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. 
He also said that she is a consensus builder and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\"" ] }, "execution_count": 5, @@ -116,9 +116,61 @@ } ], "source": [ - "qa = VectorDBQA.from_chain_type(llm=OpenAI(), chain_type=\"stuff\", vectorstore=index)\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "qa.run(query)" + "index.query(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "ae46b239", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'question': 'What did the president say about Ketanji Brown Jackson',\n", + " 'answer': \" The president said that he nominated Circuit Court of Appeals Judge Ketanji Brown Jackson, one of the nation's top legal minds, to continue Justice Breyer's legacy of excellence, and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\\n\",\n", + " 'sources': '../state_of_the_union.txt'}" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "index.query_with_sources(query)" + ] + }, + { + "cell_type": "markdown", + "id": "ff100212", + "metadata": {}, + "source": [ + "What is returned from the `VectorstoreIndexCreator` is a `VectorStoreIndexWrapper`, which provides the convenient `query` and `query_with_sources` methods. If we just want to access the vectorstore directly, we can also do that." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b04f3c10", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "index.vectorstore" ] }, { @@ -223,6 +275,7 @@ "id": "30c4e5c6", "metadata": {}, "source": [ + "So that's creating the index.\n", "Then, as before, we create a chain and use it to answer questions!" ] }, diff --git a/docs/use_cases/question_answering.md b/docs/use_cases/question_answering.md index 68078cde..4632b5b8 100644 --- a/docs/use_cases/question_answering.md +++ b/docs/use_cases/question_answering.md @@ -1,5 +1,41 @@ # Question Answering +Question answering in this context refers to question answering over your document data. +For question answering over other types of data, like [SQL databases](../modules/chains/examples/sqlite.html) or [APIs](../modules/chains/examples/api.html), please see [here](../modules/chains/utility_how_to.html) + +For question answering over many documents, you almost always want to create an index over the data. +This can be used to smartly access the most relevant documents for a given question, allowing you to avoid having to pass all the documents to the LLM (saving you time and money). + +See [this notebook](../modules/indexes/getting_started.ipynb) for a more detailed introduction to this, but for a super quick start the steps involved are: + +**Load Your Documents** +```python +from langchain.document_loaders import TextLoader +loader = TextLoader('../state_of_the_union.txt') +``` +See [here](../modules/document_loaders/how_to_guides.rst) for more information on how to get started with document loading. + +**Create Your Index** +```python +from langchain.indexes import VectorstoreIndexCreator +index = VectorstoreIndexCreator().from_loaders([loader]) +``` +The best and most popular index by far at the moment is the VectorStore index. 
+ +**Query Your Index** +```python +query = "What did the president say about Ketanji Brown Jackson" +index.query(query) +``` +Alternatively, use `query_with_sources` to also get back the sources involved: +```python +query = "What did the president say about Ketanji Brown Jackson" +index.query_with_sources(query) +``` +Again, these high-level interfaces hide a lot of what is going on under the hood, so please see [this notebook](../modules/indexes/getting_started.ipynb) for a lower-level walkthrough. + +## Document Question Answering + Question answering involves fetching multiple documents, and then asking a question of them. The LLM response will contain the answer to your question, based on the content of the documents. @@ -15,7 +51,7 @@ The following resources exist: - [Question Answering Notebook](/modules/indexes/chain_examples/question_answering.ipynb): A notebook walking through how to accomplish this task. - [VectorDB Question Answering Notebook](/modules/indexes/chain_examples/vector_db_qa.ipynb): A notebook walking through how to do question answering over a vector database. This can often be useful for when you have a LOT of documents, and you don't want to pass them all to the LLM, but rather first want to do some semantic search over embeddings. -### Adding in sources +## Adding in sources There is also a variant of this, where in addition to responding with the answer the language model will also cite its sources (eg which of the documents passed in it used). @@ -31,7 +67,7 @@ The following resources exist: - [QA With Sources Notebook](/modules/indexes/chain_examples/qa_with_sources.ipynb): A notebook walking through how to accomplish this task. - [VectorDB QA With Sources Notebook](/modules/indexes/chain_examples/vector_db_qa_with_sources.ipynb): A notebook walking through how to do question answering with sources over a vector database. 
This can often be useful for when you have a LOT of documents, and you don't want to pass them all to the LLM, but rather first want to do some semantic search over embeddings. -### Additional Related Resources +## Additional Related Resources Additional related resources include: - [Utilities for working with Documents](/modules/utils/how_to_guides.rst): Guides on how to use several of the utilities which will prove helpful for this task, including Text Splitters (for splitting up long documents) and Embeddings & Vectorstores (useful for the above Vector DB example). diff --git a/langchain/indexes/vectorstore.py b/langchain/indexes/vectorstore.py index f277247f..3b4bceea 100644 --- a/langchain/indexes/vectorstore.py +++ b/langchain/indexes/vectorstore.py @@ -1,10 +1,14 @@ -from typing import List, Type +from typing import Any, List, Optional, Type from pydantic import BaseModel, Extra, Field +from langchain.chains.qa_with_sources.vector_db import VectorDBQAWithSourcesChain +from langchain.chains.vector_db_qa.base import VectorDBQA from langchain.document_loaders.base import BaseLoader from langchain.embeddings.base import Embeddings from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.llms.base import BaseLLM +from langchain.llms.openai import OpenAI from langchain.text_splitter import RecursiveCharacterTextSplitter, TextSplitter from langchain.vectorstores.base import VectorStore from langchain.vectorstores.chroma import Chroma @@ -14,6 +18,34 @@ def _get_default_text_splitter() -> TextSplitter: return RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0) +class VectorStoreIndexWrapper(BaseModel): + """Wrapper around a vectorstore for easy access.""" + + vectorstore: VectorStore + + class Config: + """Configuration for this pydantic object.""" + + extra = Extra.forbid + arbitrary_types_allowed = True + + def query(self, question: str, llm: Optional[BaseLLM] = None, **kwargs: Any) -> str: + """Query the vectorstore.""" + llm = llm or 
OpenAI(temperature=0) + chain = VectorDBQA.from_chain_type(llm, vectorstore=self.vectorstore, **kwargs) + return chain.run(question) + + def query_with_sources( + self, question: str, llm: Optional[BaseLLM] = None, **kwargs: Any + ) -> dict: + """Query the vectorstore and get back sources.""" + llm = llm or OpenAI(temperature=0) + chain = VectorDBQAWithSourcesChain.from_chain_type( + llm, vectorstore=self.vectorstore, **kwargs + ) + return chain({chain.question_key: question}) + + class VectorstoreIndexCreator(BaseModel): """Logic for creating indexes.""" @@ -27,11 +59,11 @@ class VectorstoreIndexCreator(BaseModel): extra = Extra.forbid arbitrary_types_allowed = True - def from_loaders(self, loaders: List[BaseLoader]) -> VectorStore: + def from_loaders(self, loaders: List[BaseLoader]) -> VectorStoreIndexWrapper: """Create a vectorstore index from loaders.""" docs = [] for loader in loaders: docs.extend(loader.load()) sub_docs = self.text_splitter.split_documents(docs) vectorstore = self.vectorstore_cls.from_documents(sub_docs, self.embedding) - return vectorstore + return VectorStoreIndexWrapper(vectorstore=vectorstore) diff --git a/pyproject.toml b/pyproject.toml index 33c2a203..1387ef07 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain" -version = "0.0.94" +version = "0.0.95" description = "Building applications with LLMs through composability" authors = [] license = "MIT"