chroma docs (#1012)

makefile-update-1
Harrison Chase 1 year ago committed by GitHub
parent 0c553d2064
commit 7fb33fca47
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

@ -0,0 +1,20 @@
# Chroma
This page covers how to use the Chroma ecosystem within LangChain.
It is broken into two parts: installation and setup, and then references to specific Chroma wrappers.
## Installation and Setup
- Install the Python package with `pip install chromadb`
## Wrappers
### VectorStore
There exists a wrapper around Chroma vector databases, allowing you to use it as a vectorstore,
whether for semantic search or example selection.
To import this vectorstore:
```python
from langchain.vectorstores import Chroma
```
For a more detailed walkthrough of the Chroma wrapper, see [this notebook](../modules/utils/combine_docs_examples/vectorstores.ipynb).

@ -1,7 +1,6 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"id": "134a0785",
"metadata": {},
@ -19,11 +18,10 @@
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.llms import OpenAI\n",
"from langchain.chains import ChatVectorDBChain\n",
"from langchain.document_loaders import TextLoader"
"from langchain.chains import ChatVectorDBChain"
]
},
{
@ -41,6 +39,7 @@
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()"
]
@ -76,16 +75,25 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"id": "a8930cf7",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"documents = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"vectorstore = FAISS.from_documents(documents, embeddings)"
"vectorstore = Chroma.from_documents(documents, embeddings)"
]
},
{

@ -21,7 +21,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 4,
"id": "78f28130",
"metadata": {},
"outputs": [],
@ -30,14 +30,14 @@
"from langchain.embeddings.cohere import CohereEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.docstore.document import Document\n",
"from langchain.prompts import PromptTemplate"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"id": "4da195a3",
"metadata": {},
"outputs": [],
@ -52,17 +52,26 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"id": "5ec2b55b",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{\"source\": i} for i in range(len(texts))])"
"docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": str(i)} for i in range(len(texts))])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 7,
"id": "5286f58f",
"metadata": {},
"outputs": [],
@ -73,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 8,
"id": "005a47e9",
"metadata": {},
"outputs": [],
@ -93,7 +102,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 9,
"id": "3722373b",
"metadata": {},
"outputs": [
@ -103,7 +112,7 @@
"{'output_text': ' The president thanked Justice Breyer for his service.\\nSOURCES: 30-pl'}"
]
},
"execution_count": 13,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -699,7 +708,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -28,7 +28,7 @@
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.docstore.document import Document\n",
"from langchain.prompts import PromptTemplate"
]
@ -40,27 +40,37 @@
"metadata": {},
"outputs": [],
"source": [
"with open('../../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "fd9666a9",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)"
"docsearch = Chroma.from_documents(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "d1eaf6e6",
"metadata": {},
"outputs": [],
@ -673,7 +683,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -18,7 +18,7 @@
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain import OpenAI, VectorDBQA"
]
@ -28,15 +28,25 @@
"execution_count": 2,
"id": "5c7049db",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"with open('../../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"docsearch = FAISS.from_texts(texts, embeddings)"
"docsearch = Chroma.from_documents(texts, embeddings)"
]
},
{
@ -58,7 +68,7 @@
{
"data": {
"text/plain": [
"\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, from a family of public school educators and police officers, a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
"\" The president said that Ketanji Brown Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers. He also said that she is a consensus builder, and has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
]
},
"execution_count": 4,
@ -256,7 +266,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -21,7 +21,7 @@
"from langchain.embeddings.cohere import CohereEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
"from langchain.vectorstores.faiss import FAISS"
"from langchain.vectorstores import Chroma"
]
},
{
@ -41,29 +41,27 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "0e745d99",
"metadata": {},
"outputs": [],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "f42d79dc",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n",
"Exiting: Cleaning up .chroma directory\n"
]
}
],
"source": [
"# Add in a fake source information\n",
"for i, d in enumerate(docsearch.docstore._dict.values()):\n",
" d.metadata = {'source': f\"{i}-pl\"}"
"docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": f\"{i}-pl\"} for i in range(len(texts))])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "8aa571ae",
"metadata": {},
"outputs": [],
@ -73,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "aa859d4c",
"metadata": {},
"outputs": [],
@ -85,18 +83,18 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 8,
"id": "8ba36fa7",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'answer': ' The president thanked Justice Breyer for his service.\\n',\n",
"{'answer': ' The president thanked Justice Breyer for his service and mentioned his legacy of excellence.\\n',\n",
" 'sources': '30-pl'}"
]
},
"execution_count": 7,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -207,7 +205,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -28,7 +28,7 @@
"from langchain.docstore.document import Document\n",
"import requests\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.prompts import PromptTemplate\n",
"import pathlib\n",
@ -96,7 +96,7 @@
"metadata": {},
"outputs": [],
"source": [
"search_index = FAISS.from_documents(source_chunks, OpenAIEmbeddings())"
"search_index = Chroma.from_documents(source_chunks, OpenAIEmbeddings())"
]
},
{
@ -191,7 +191,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -12,7 +12,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"id": "8b54479e",
"metadata": {},
"outputs": [],
@ -65,36 +65,46 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 1,
"id": "aab39528",
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain import OpenAI, VectorDBQA"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 3,
"id": "16a85d5e",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"with open('../../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"vectorstore = FAISS.from_texts(texts, embeddings)"
"vectorstore = Chroma.from_documents(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 6,
"id": "6a82e91e",
"metadata": {},
"outputs": [],
@ -104,17 +114,17 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 7,
"id": "efe9b25b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\" The president said that Jackson is one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, and from a family of public school educators and police officers, and that she has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans.\""
"\" The president said that Ketanji Brown Jackson is a Circuit Court of Appeals Judge, one of the nation's top legal minds, a former top litigator in private practice, a former federal public defender, has received a broad range of support from the Fraternal Order of Police to former judges appointed by Democrats and Republicans, and will continue Justice Breyer's legacy of excellence.\""
]
},
"execution_count": 10,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -149,7 +159,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -33,6 +33,27 @@
"pages = loader.load_and_split()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d333cabb",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Document(page_content='LayoutParser : A Uni\\x0ced Toolkit for Deep\\nLearning Based Document Image Analysis\\nZejiang Shen1( \\x00), Ruochen Zhang2, Melissa Dell3, Benjamin Charles Germain\\nLee4, Jacob Carlson3, and Weining Li5\\n1Allen Institute for AI\\nshannons@allenai.org\\n2Brown University\\nruochen zhang@brown.edu\\n3Harvard University\\nfmelissadell,jacob carlson g@fas.harvard.edu\\n4University of Washington\\nbcgl@cs.washington.edu\\n5University of Waterloo\\nw422li@uwaterloo.ca\\nAbstract. Recent advances in document image analysis (DIA) have been\\nprimarily driven by the application of neural networks. Ideally, research\\noutcomes could be easily deployed in production and extended for further\\ninvestigation. However, various factors like loosely organized codebases\\nand sophisticated model con\\x0cgurations complicate the easy reuse of im-\\nportant innovations by a wide audience. Though there have been on-going\\ne\\x0borts to improve reusability and simplify deep learning (DL) model\\ndevelopment in disciplines like natural language processing and computer\\nvision, none of them are optimized for challenges in the domain of DIA.\\nThis represents a major gap in the existing toolkit, as DIA is central to\\nacademic research across a wide range of disciplines in the social sciences\\nand humanities. This paper introduces LayoutParser , an open-source\\nlibrary for streamlining the usage of DL in DIA research and applica-\\ntions. The core LayoutParser library comes with a set of simple and\\nintuitive interfaces for applying and customizing DL models for layout de-\\ntection, character recognition, and many other document processing tasks.\\nTo promote extensibility, LayoutParser also incorporates a community\\nplatform for sharing both pre-trained models and full document digiti-\\nzation pipelines. 
We demonstrate that LayoutParser is helpful for both\\nlightweight and large-scale digitization pipelines in real-word use cases.\\nThe library is publicly available at https://layout-parser.github.io .\\nKeywords: Document Image Analysis ·Deep Learning ·Layout Analysis\\n·Character Recognition ·Open Source library ·Toolkit.\\n1 Introduction\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of\\ndocument image analysis (DIA) tasks including document image classi\\x0ccation [ 11,arXiv:2103.15348v2 [cs.CV] 21 Jun 2021', lookup_str='', metadata={'source': 'example_data/layout-parser-paper.pdf', 'page': '0'}, lookup_index=0)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pages[0]"
]
},
{
"cell_type": "markdown",
"id": "ebd895e4",
@ -43,7 +64,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 9,
"id": "87fa7b3a",
"metadata": {},
"outputs": [

@ -21,7 +21,7 @@
"from langchain.embeddings.cohere import CohereEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores.elastic_vector_search import ElasticVectorSearch\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.docstore.document import Document"
]
},
@ -45,9 +45,18 @@
"execution_count": 4,
"id": "aa70c847",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings, metadatas=[{\"source\": i} for i in range(len(texts))])"
"docsearch = Chroma.from_texts(texts, embeddings, metadatas=[{\"source\": i} for i in range(len(texts))])"
]
},
{
@ -108,7 +117,7 @@
{
"data": {
"text/plain": [
"{'output_text': \" President Biden honored Justice Stephen Breyer, an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. He thanked Justice Breyer for his service and said that one of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. He then announced his nomination of Circuit Court of Appeals Judge Ketanji Brown Jackson to continue Justice Breyer's legacy of excellence.\"}"
"{'output_text': ' Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service.'}"
]
},
"execution_count": 8,
@ -133,7 +142,7 @@
"text": [
"\n",
"Human: What did the president say about Justice Breyer\n",
"AI: President Biden honored Justice Stephen Breyer, an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. He thanked Justice Breyer for his service and said that one of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. He then announced his nomination of Circuit Court of Appeals Judge Ketanji Brown Jackson to continue Justice Breyer's legacy of excellence.\n"
"AI: Tonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service.\n"
]
}
],
@ -166,7 +175,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "8244ff60",
"metadata": {},
"outputs": [],
@ -43,7 +43,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 4,
"id": "7c469c95",
"metadata": {},
"outputs": [],
@ -54,7 +54,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 5,
"id": "0ec6d950",
"metadata": {},
"outputs": [],
@ -71,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 6,
"id": "207e55f7",
"metadata": {},
"outputs": [],
@ -105,7 +105,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 7,
"id": "d00b4385",
"metadata": {},
"outputs": [
@ -142,7 +142,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 8,
"id": "878bcde9",
"metadata": {},
"outputs": [
@ -168,7 +168,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 9,
"id": "e4bebcd9",
"metadata": {},
"outputs": [
@ -220,22 +220,31 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 10,
"id": "241bfe80",
"metadata": {},
"outputs": [],
"source": [
"from langchain.prompts.example_selector import SemanticSimilarityExampleSelector\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import OpenAIEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 11,
"id": "50d0a701",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"example_selector = SemanticSimilarityExampleSelector.from_examples(\n",
" # This is the list of examples available to select from.\n",
@ -243,7 +252,7 @@
" # This is the embedding class used to produce embeddings which are used to measure semantic similarity.\n",
" OpenAIEmbeddings(), \n",
" # This is the VectorStore class that is used to store the embeddings and do a similarity search over.\n",
" FAISS, \n",
" Chroma, \n",
" # This is the number of examples to produce.\n",
" k=1\n",
")\n",
@ -259,7 +268,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 12,
"id": "4c8fdf45",
"metadata": {},
"outputs": [
@ -284,7 +293,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 13,
"id": "829af21a",
"metadata": {
"scrolled": true
@ -311,7 +320,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 14,
"id": "3c16fe23",
"metadata": {},
"outputs": [
@ -347,17 +356,18 @@
},
{
"cell_type": "code",
"execution_count": 19,
"execution_count": 18,
"id": "ac95c968",
"metadata": {},
"outputs": [],
"source": [
"from langchain.prompts.example_selector import MaxMarginalRelevanceExampleSelector"
"from langchain.prompts.example_selector import MaxMarginalRelevanceExampleSelector\n",
"from langchain.vectorstores import FAISS"
]
},
{
"cell_type": "code",
"execution_count": 20,
"execution_count": 19,
"id": "db579bea",
"metadata": {},
"outputs": [],
@ -384,7 +394,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": 20,
"id": "cd76e344",
"metadata": {},
"outputs": [
@ -412,7 +422,7 @@
},
{
"cell_type": "code",
"execution_count": 22,
"execution_count": 21,
"id": "cf82956b",
"metadata": {},
"outputs": [
@ -422,9 +432,6 @@
"text": [
"Give the antonym of every input\n",
"\n",
"Input: happy\n",
"Output: sad\n",
"\n",
"Input: enthusiastic\n",
"Output: apathetic\n",
"\n",
@ -696,7 +703,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -242,6 +242,8 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n",
"Examples most similar to the input: Who was the father of Mary Ball Washington?\n",
"\n",
"\n",
@ -259,7 +261,7 @@
],
"source": [
"from langchain.prompts.example_selector import SemanticSimilarityExampleSelector\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import OpenAIEmbeddings\n",
"\n",
"\n",
@ -269,7 +271,7 @@
" # This is the embedding class used to produce embeddings which are used to measure semantic similarity.\n",
" OpenAIEmbeddings(),\n",
" # This is the VectorStore class that is used to store the embeddings and do a similarity search over.\n",
" FAISS,\n",
" Chroma,\n",
" # This is the number of examples to produce.\n",
" k=1\n",
")\n",
@ -328,6 +330,14 @@
"\n",
"print(prompt.format(input=\"Who was the father of Mary Ball Washington?\"))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84c43b97",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
@ -346,7 +356,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -71,7 +71,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 3,
"id": "094229f4",
"metadata": {},
"outputs": [],
@ -81,7 +81,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 4,
"id": "ab46bd2a",
"metadata": {},
"outputs": [
@ -91,7 +91,7 @@
"'Tell me a joke.'"
]
},
"execution_count": 2,
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@ -104,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 5,
"id": "c3ad0fa8",
"metadata": {},
"outputs": [
@ -114,7 +114,7 @@
"'Tell me a funny joke.'"
]
},
"execution_count": 3,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@ -127,7 +127,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 6,
"id": "ba577dcf",
"metadata": {},
"outputs": [
@ -137,7 +137,7 @@
"'Tell me a funny joke about chickens.'"
]
},
"execution_count": 4,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -162,7 +162,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 7,
"id": "d0a0756c",
"metadata": {},
"outputs": [],
@ -173,7 +173,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 8,
"id": "59046640",
"metadata": {},
"outputs": [
@ -183,7 +183,7 @@
"PromptTemplate(input_variables=['adjective', 'content'], output_parser=None, template='Tell me a {adjective} joke about {content}.', template_format='f-string', validate_template=True)"
]
},
"execution_count": 3,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -204,7 +204,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"id": "53b41b6a",
"metadata": {},
"outputs": [],
@ -226,7 +226,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 10,
"id": "ba8aabd3",
"metadata": {},
"outputs": [
@ -236,7 +236,7 @@
"'\\n\\nQuestion: foo\\nAnswer: bar\\n\\nQuestion: 1\\nAnswer: 2\\n'"
]
},
"execution_count": 6,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -261,7 +261,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 11,
"id": "3eb36972",
"metadata": {},
"outputs": [],
@ -280,7 +280,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 12,
"id": "80a91d96",
"metadata": {},
"outputs": [],
@ -290,7 +290,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 13,
"id": "7931e5f2",
"metadata": {},
"outputs": [
@ -343,7 +343,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 14,
"id": "e710115f",
"metadata": {},
"outputs": [],
@ -353,7 +353,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 15,
"id": "5bf23a65",
"metadata": {},
"outputs": [],
@ -374,7 +374,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 16,
"id": "d4036351",
"metadata": {},
"outputs": [
@ -428,7 +428,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 17,
"id": "7c469c95",
"metadata": {},
"outputs": [],
@ -438,7 +438,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 18,
"id": "0ec6d950",
"metadata": {},
"outputs": [],
@ -455,7 +455,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 19,
"id": "207e55f7",
"metadata": {},
"outputs": [],
@ -485,7 +485,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 20,
"id": "d00b4385",
"metadata": {},
"outputs": [
@ -522,7 +522,7 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 21,
"id": "878bcde9",
"metadata": {},
"outputs": [
@ -548,7 +548,7 @@
},
{
"cell_type": "code",
"execution_count": 15,
"execution_count": 22,
"id": "e4bebcd9",
"metadata": {},
"outputs": [
@ -600,22 +600,31 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 23,
"id": "241bfe80",
"metadata": {},
"outputs": [],
"source": [
"from langchain.prompts.example_selector import SemanticSimilarityExampleSelector\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.embeddings import OpenAIEmbeddings"
]
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": 24,
"id": "50d0a701",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"example_selector = SemanticSimilarityExampleSelector.from_examples(\n",
" # This is the list of examples available to select from.\n",
@ -623,7 +632,7 @@
" # This is the embedding class used to produce embeddings which are used to measure semantic similarity.\n",
" OpenAIEmbeddings(), \n",
" # This is the VectorStore class that is used to store the embeddings and do a similarity search over.\n",
" FAISS, \n",
" Chroma, \n",
" # This is the number of examples to produce.\n",
" k=1\n",
")\n",
@ -639,7 +648,7 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 25,
"id": "4c8fdf45",
"metadata": {},
"outputs": [
@ -732,7 +741,8 @@
"metadata": {},
"outputs": [],
"source": [
"from langchain.prompts.example_selector import MaxMarginalRelevanceExampleSelector"
"from langchain.prompts.example_selector import MaxMarginalRelevanceExampleSelector\n",
"from langchain.vectorstores import FAISS"
]
},
{
@ -863,7 +873,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -338,7 +338,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -160,7 +160,7 @@
"outputs": [],
"source": [
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"\n",
"with open('../../state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
@ -173,9 +173,18 @@
"execution_count": 12,
"id": "bfcfc039",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)\n",
"docsearch = Chroma.from_texts(texts, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
@ -201,7 +210,7 @@
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence. \n"
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n"
]
}
],
@ -220,7 +229,7 @@
],
"metadata": {
"kernelspec": {
"display_name": "llm-env",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
@ -234,7 +243,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.0 (default, Nov 15 2020, 06:25:35) \n[Clang 10.0.0 ]"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -27,7 +27,7 @@
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS, Qdrant"
"from langchain.vectorstores import ElasticVectorSearch, Pinecone, Weaviate, FAISS, Qdrant, Chroma"
]
},
{
@ -51,16 +51,25 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 9,
"id": "015f4ff5",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)\n",
"docsearch = Chroma.from_texts(texts, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
@ -68,7 +77,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 10,
"id": "67baf32e",
"metadata": {
"pycharm": {
@ -109,17 +118,17 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 11,
"id": "70758e4f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['64108bd0-4d91-485c-9743-1e18debdd59e']"
"['a05e3d0c-ab40-11ed-a853-e65801318981']"
]
},
"execution_count": 5,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
@ -130,7 +139,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 12,
"id": "4edeb88f",
"metadata": {},
"outputs": [],
@ -141,7 +150,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 13,
"id": "1cba64a2",
"metadata": {},
"outputs": [
@ -151,7 +160,7 @@
"Document(page_content='Ankush went to Princeton', lookup_str='', metadata={}, lookup_index=0)"
]
},
"execution_count": 7,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -171,7 +180,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 14,
"id": "df4a459c",
"metadata": {},
"outputs": [],
@ -181,12 +190,21 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 15,
"id": "4b480245",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"docsearch = FAISS.from_documents(documents, embeddings)\n",
"docsearch = Chroma.from_documents(documents, embeddings)\n",
"\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"docs = docsearch.similarity_search(query)"
@ -194,7 +212,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 16,
"id": "86aa4cda",
"metadata": {},
"outputs": [
@ -212,7 +230,7 @@
"\n",
"One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n",
"\n",
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence. \n"
"And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.\n"
]
}
],
@ -225,10 +243,28 @@
"id": "2445a5e6",
"metadata": {},
"source": [
"## FAISS-specific\n",
"## FAISS\n",
"There are some FAISS-specific methods. One of them is `similarity_search_with_score`, which returns not only the documents but also each document's similarity score for the query."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "479e22ce",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Exiting: Cleaning up .chroma directory\n"
]
}
],
"source": [
"docsearch = FAISS.from_texts(texts, embeddings)"
]
},
{
"cell_type": "code",
"execution_count": 4,

@ -21,25 +21,35 @@
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores.faiss import FAISS\n",
"from langchain.vectorstores import Chroma\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain import OpenAI, VectorDBQA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 3,
"id": "4fdc211d",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Running Chroma using direct local API.\n",
"Using DuckDB in-memory for database. Data will be transient.\n"
]
}
],
"source": [
"with open('../../modules/state_of_the_union.txt') as f:\n",
" state_of_the_union = f.read()\n",
"from langchain.document_loaders import TextLoader\n",
"loader = TextLoader('../../modules/state_of_the_union.txt')\n",
"documents = loader.load()\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"texts = text_splitter.split_text(state_of_the_union)\n",
"texts = text_splitter.split_documents(documents)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"docsearch = FAISS.from_texts(texts, embeddings)\n",
"docsearch = Chroma.from_documents(texts, embeddings)\n",
"qa = VectorDBQA.from_llm(llm=OpenAI(), vectorstore=docsearch)"
]
},
@ -57,7 +67,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 4,
"id": "3459b001",
"metadata": {},
"outputs": [],
@ -77,7 +87,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 5,
"id": "b9c3fa75",
"metadata": {},
"outputs": [],
@ -89,7 +99,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "c24543a9",
"metadata": {},
"outputs": [],
@ -279,7 +289,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.9"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -108,8 +108,6 @@ class Chroma(VectorStore):
query_embeddings=[query_embedding], n_results=k, where=filter
)
print(results)
docs = [
# TODO: Chroma can do batch querying,
# we shouldn't hard code to the 1st result

Loading…
Cancel
Save