diff --git a/docs/.local_build.sh b/docs/.local_build.sh new file mode 100755 index 00000000..ee0b7432 --- /dev/null +++ b/docs/.local_build.sh @@ -0,0 +1,12 @@ +mkdir _dist +cp -r {docs_skeleton,snippets} _dist +mkdir -p _dist/docs_skeleton/static/api_reference +cd api_reference +poetry run make html +cp -r _build/* ../_dist/docs_skeleton/static/api_reference +cd .. +cp -r extras/* _dist/docs_skeleton/docs +cd _dist/docs_skeleton +poetry run nbdoc_build +yarn install +yarn start diff --git a/docs/extras/modules/data_connection/vectorstores/integrations/deeplake.ipynb b/docs/extras/modules/data_connection/vectorstores/integrations/deeplake.ipynb index 07a3933f..d8f8e3a7 100644 --- a/docs/extras/modules/data_connection/vectorstores/integrations/deeplake.ipynb +++ b/docs/extras/modules/data_connection/vectorstores/integrations/deeplake.ipynb @@ -26,10 +26,8 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": { - "tags": [] - }, + "execution_count": 1, + "metadata": {}, "outputs": [], "source": [ "from langchain.embeddings.openai import OpenAIEmbeddings\n", @@ -39,30 +37,23 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdin", - "output_type": "stream", - "text": [ - "OpenAI API Key: ········\n" - ] - } - ], + "outputs": [], "source": [ "import os\n", "import getpass\n", "\n", "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n", + "activeloop_token = getpass.getpass(\"activeloop token:\")\n", "embeddings = OpenAIEmbeddings()" ] }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "metadata": { "tags": [] }, @@ -70,7 +61,7 @@ "source": [ "from langchain.document_loaders import TextLoader\n", "\n", - "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "loader = TextLoader(\"docs/modules/state_of_the_union.txt\")\n", "documents = loader.load()\n", "text_splitter = CharacterTextSplitter(chunk_size=1000, 
chunk_overlap=0)\n", "docs = text_splitter.split_documents(documents)\n", @@ -87,7 +78,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 4, "metadata": { "tags": [] }, @@ -95,57 +86,27 @@ { "name": "stderr", "output_type": "stream", - "text": [ - "/home/leo/.local/lib/python3.10/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.3.2) is available. It's recommended that you update to the latest version using `pip install -U deeplake`.\n", - " warnings.warn(\n" - ] + "text": [] }, { "name": "stdout", "output_type": "stream", "text": [ - "./my_deeplake/ loaded successfully.\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Evaluating ingest: 100%|██████████████████████████████████████| 1/1 [00:07<00:00\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset(path='./my_deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "Dataset(path='./my_deeplake/', tensors=['embedding', 'id', 'metadata', 'text'])\n", "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (42, 1536) float32 None \n", - " ids text (42, 1) str None \n", - " metadata json (42, 1) str None \n", - " text text (42, 1) str None \n" + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding embedding (42, 1536) float32 None \n", + " id text (42, 1) str None \n", + " metadata json (42, 1) str None \n", + " text text (42, 1) str None \n" ] } ], "source": [ - "db = DeepLake(dataset_path=\"./my_deeplake/\", embedding_function=embeddings)\n", + "db = DeepLake(\n", + " dataset_path=\"./my_deeplake/\", embedding_function=embeddings, overwrite=True\n", 
+ ")\n", "db.add_documents(docs)\n", "# or shorter\n", "# db = DeepLake.from_documents(docs, dataset_path=\"./my_deeplake/\", embedding=embeddings, overwrite=True)\n", @@ -155,7 +116,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "metadata": { "tags": [] }, @@ -187,7 +148,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 6, "metadata": { "tags": [] }, @@ -195,36 +156,9 @@ { "name": "stdout", "output_type": "stream", - "text": [ - "./my_deeplake/ loaded successfully.\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [] - }, - { - "name": "stderr", - "output_type": "stream", "text": [ "Deep Lake Dataset in ./my_deeplake/ already exists, loading from the storage\n" ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset(path='./my_deeplake/', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n", - "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (42, 1536) float32 None \n", - " ids text (42, 1) str None \n", - " metadata json (42, 1) str None \n", - " text text (42, 1) str None \n" - ] } ], "source": [ @@ -250,7 +184,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 7, "metadata": { "tags": [] }, @@ -259,7 +193,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/home/leo/.local/lib/python3.10/site-packages/langchain/llms/openai.py:624: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. Instead, please use: `from langchain.chat_models import ChatOpenAI`\n", + "/Users/adilkhansarsen/Documents/work/LangChain/langchain/langchain/llms/openai.py:751: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. 
Instead, please use: `from langchain.chat_models import ChatOpenAI`\n", " warnings.warn(\n" ] } @@ -277,7 +211,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 8, "metadata": { "tags": [] }, @@ -285,10 +219,10 @@ { "data": { "text/plain": [ - "'The president nominated Ketanji Brown Jackson to serve on the United States Supreme Court. He described her as a former top litigator in private practice, a former federal public defender, a consensus builder, and from a family of public school educators and police officers. He also mentioned that she has received broad support from various groups since being nominated.'" + "'The President nominated Ketanji Brown Jackson to serve on the United States Supreme Court and spoke highly of her legal expertise and reputation as a consensus builder.'" ] }, - "execution_count": 10, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -307,35 +241,26 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 9, "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "./my_deeplake/ loaded successfully.\n" - ] - }, { "name": "stderr", "output_type": "stream", - "text": [ - "Evaluating ingest: 100%|██████████| 1/1 [00:04<00:00\n" - ] + "text": [] }, { "name": "stdout", "output_type": "stream", "text": [ - "Dataset(path='./my_deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "Dataset(path='./my_deeplake/', tensors=['embedding', 'id', 'metadata', 'text'])\n", "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (4, 1536) float32 None \n", - " ids text (4, 1) str None \n", - " metadata json (4, 1) str None \n", - " text text (4, 1) str None \n" + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding embedding (4, 1536) float32 None \n", + " id text (4, 1) str None \n", + " metadata json (4, 1) str 
None \n", + " text text (4, 1) str None \n" ] }, { @@ -357,31 +282,33 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 4/4 [00:00<00:00, 1080.24it/s]\n" + "100%|██████████| 4/4 [00:00<00:00, 3300.00it/s]\n" ] }, { "data": { "text/plain": [ - "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. 
\\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013})]" + "[Document(lc_kwargs={'page_content': 'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 
\\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. 
\\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. 
\\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013})]" ] }, - "execution_count": 55, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "db.similarity_search(\n", - " \"What did the president say about Ketanji Brown Jackson\", filter={\"year\": 2013}\n", + " \"What did the president say about Ketanji Brown Jackson\",\n", + " filter={\"metadata\": {\"year\": 2013}},\n", ")" ] }, @@ -395,19 +322,19 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", - " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. 
It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012})]" + "[Document(lc_kwargs={'page_content': 'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 
\\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. 
\\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. 
\\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. 
I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2012}}, page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2012})]" ] }, - "execution_count": 56, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -428,19 +355,19 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. 
Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", - " Document(page_content='A former top litigator in private practice. A former federal public defender. 
And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", - " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. 
\\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013})]" + "[Document(lc_kwargs={'page_content': 'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. 
\\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}}, page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. 
\\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2013}),\n", + " Document(lc_kwargs={'page_content': 'And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt', 'year': 2012}}, page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. 
From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', metadata={'source': 'docs/modules/state_of_the_union.txt', 'year': 2012})]" ] }, - "execution_count": 57, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -460,9 +387,15 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 13, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [] + } + ], "source": [ "db.delete_dataset()" ] @@ -476,7 +409,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -499,16 +432,23 @@ }, { "cell_type": "code", - "execution_count": 62, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "os.environ[\"ACTIVELOOP_TOKEN\"] = getpass.getpass(\"Activeloop Token:\")" + "os.environ[\"ACTIVELOOP_TOKEN\"] = activeloop_token" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Deep Lake now supports three execution modes for search: `python`, a naive way of searching inside the data; `tensor_db`, a managed database that runs TQL on a remote optimized engine and sends the results back; and `compute_engine`, a C++ implementation of search that runs locally."
] }, { "cell_type": "code", - "execution_count": 63, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -516,43 +456,85 @@ "output_type": "stream", "text": [ "Your Deep Lake dataset has been successfully created!\n", - "The dataset is private so make sure you are logged in!\n", - "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/langchain_test\n", - "hub://davitbun/langchain_test loaded successfully.\n" + "The dataset is private so make sure you are logged in!\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Evaluating ingest: 100%|██████████| 1/1 [00:14<00:00\n", - " \r" + "-" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Dataset(path='hub://davitbun/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "Dataset(path='hub://adilkhan/langchain_testing_python', tensors=['embedding', 'id', 'metadata', 'text'])\n", "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (4, 1536) float32 None \n", - " ids text (4, 1) str None \n", - " metadata json (4, 1) str None \n", - " text text (4, 1) str None \n" + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding embedding (42, 1536) float32 None \n", + " id text (42, 1) str None \n", + " metadata json (42, 1) str None \n", + " text text (42, 1) str None \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" ] }, { "data": { "text/plain": [ - "['d6d6ccb4-e187-11ed-b66d-41c5f7b85421',\n", - " 'd6d6ccb5-e187-11ed-b66d-41c5f7b85421',\n", - " 'd6d6ccb6-e187-11ed-b66d-41c5f7b85421',\n", - " 'd6d6ccb7-e187-11ed-b66d-41c5f7b85421']" + "['d604b1ac-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b238-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b260-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b27e-093c-11ee-bdba-76d8a30504e0',\n", + " 
'd604b29c-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b2ba-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b2d8-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b2f6-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b314-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b332-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b350-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b36e-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b38c-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b3a0-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b3be-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b3dc-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b3fa-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b418-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b436-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b454-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b472-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b490-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b4a4-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b4c2-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b4e0-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b4fe-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b51c-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b53a-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b558-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b576-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b594-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b5b2-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b5c6-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b5e4-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b602-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b620-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b63e-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b65c-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b67a-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b698-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b6b6-093c-11ee-bdba-76d8a30504e0',\n", + " 'd604b6d4-093c-11ee-bdba-76d8a30504e0']" ] }, - "execution_count": 63, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -560,7 +542,9 @@ "source": [ "# Embed and store the texts\n", "username = 
\"\" # your username on app.activeloop.ai\n", - "dataset_path = f\"hub://{username}/langchain_test\" # could be also ./local/path (much faster locally), s3://bucket/path/to/dataset, gcs://path/to/dataset, etc.\n", + "dataset_path = f\"hub://{username}/langchain_testing_python\" # could be also ./local/path (much faster locally), s3://bucket/path/to/dataset, gcs://path/to/dataset, etc.\n", + "\n", + "docs = text_splitter.split_documents(documents)\n", "\n", "embedding = OpenAIEmbeddings()\n", "db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings, overwrite=True)\n", @@ -569,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": 64, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -592,6 +576,204 @@ "print(docs[0].page_content)" ] }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your Deep Lake dataset has been successfully created!\n", + "The dataset is private so make sure you are logged in!\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "|" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset(path='hub://adilkhan/langchain_testing', tensors=['embedding', 'id', 'metadata', 'text'])\n", + "\n", + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding embedding (42, 1536) float32 None \n", + " id text (42, 1) str None \n", + " metadata json (42, 1) str None \n", + " text text (42, 1) str None \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "['6584c33a-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c3ee-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c420-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c43e-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c466-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c484-093d-11ee-bdba-76d8a30504e0',\n", + " 
'6584c4a2-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c4c0-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c4de-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c4fc-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c51a-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c538-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c556-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c574-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c592-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c5b0-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c5ce-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c5f6-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c614-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c632-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c646-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c66e-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c682-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c6a0-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c6be-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c6e6-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c704-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c722-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c740-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c75e-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c77c-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c79a-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c7ae-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c7cc-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c7ea-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c808-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c826-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c844-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c862-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c876-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c894-093d-11ee-bdba-76d8a30504e0',\n", + " '6584c8bc-093d-11ee-bdba-76d8a30504e0']" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Embed and store the texts\n", + "username = \"adilkhan\" # your username on app.activeloop.ai\n", + "dataset_path = f\"hub://{username}/langchain_testing\" # could 
be also ./local/path (much faster locally), s3://bucket/path/to/dataset, gcs://path/to/dataset, etc.\n", + "\n", + "docs = text_splitter.split_documents(documents)\n", + "\n", + "embedding = OpenAIEmbeddings()\n", + "db = DeepLake(\n", + " dataset_path=dataset_path,\n", + " embedding_function=embeddings,\n", + " overwrite=True,\n", + " exec_option=\"tensor_db\",\n", + ")\n", + "db.add_documents(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + ] + } + ], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query, exec_option=\"tensor_db\")\n", + "print(docs[0].page_content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### The difference will be apparent on bigger datasets (~10000 rows)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### TQL Search" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can use TQL search with DeepLake" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [], + "source": [ + "search_id = db.vectorstore.dataset.id[0].numpy()" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "docs = db.similarity_search(\n", + " query=None,\n", + " tql_query=f\"SELECT * WHERE id == '{search_id[0]}'\",\n", + " exec_option=\"tensor_db\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(lc_kwargs={'page_content': 'Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.
\\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. \\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', 'metadata': {'source': 'docs/modules/state_of_the_union.txt'}}, page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans. \\n\\nLast year COVID-19 kept us apart. This year we are finally together again. \\n\\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans. \\n\\nWith a duty to one another to the American people to the Constitution. \\n\\nAnd with an unwavering resolve that freedom will always triumph over tyranny. \\n\\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated. \\n\\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined. \\n\\nHe met the Ukrainian people. 
\\n\\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.', metadata={'source': 'docs/modules/state_of_the_union.txt'})]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -668,37 +850,37 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Dataset(path='hub://davitbun/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "Dataset(path='hub://adilkhan/langchain_testing', tensors=['embedding', 'id', 'metadata', 'text'])\n", "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (4, 1536) float32 None \n", - " ids text (4, 1) str None \n", - " metadata json (4, 1) str None \n", - " text text (4, 1) str None \n" + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding embedding (42, 1536) float32 None \n", + " id text (42, 1) str None \n", + " metadata json (42, 1) str None \n", + " text text (42, 1) str None \n" ] } ], "source": [ "# get structure of the dataset\n", - "db.ds.summary()" + "db.vectorstore.summary()" ] }, { "cell_type": "code", - "execution_count": 67, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# get embeddings numpy array\n", - "embeds = db.ds.embedding.numpy()" + "embeds = db.vectorstore.dataset.embedding.numpy()" ] }, { @@ -854,18 +1036,11 @@ "db = DeepLake(dataset_path=destination, embedding_function=embeddings)\n", "db.add_documents(docs)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3.9.6 ('langchain_venv': venv)", 
"language": "python", "name": "python3" }, @@ -879,11 +1054,11 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.6" + "version": "3.9.6" }, "vscode": { "interpreter": { - "hash": "7b14174bb6f9d4680b62ac2a6390e1ce94fbfabf172a10844870451d539c58d6" + "hash": "0b0bacaffd430edc3085253ee7ee1bcda9f76a5e66b369dda8ba68baa6d14ba7" } } }, diff --git a/langchain/vectorstores/deeplake.py b/langchain/vectorstores/deeplake.py index 62ae2a57..4a6397b5 100644 --- a/langchain/vectorstores/deeplake.py +++ b/langchain/vectorstores/deeplake.py @@ -2,12 +2,18 @@ from __future__ import annotations import logging -import uuid -from functools import partial -from typing import Any, Callable, Dict, Iterable, List, Optional, Sequence, Tuple +from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union import numpy as np +try: + import deeplake + from deeplake.core.vectorstore import DeepLakeVectorStore + + _DEEPLAKE_INSTALLED = True +except ImportError: + _DEEPLAKE_INSTALLED = False + from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings from langchain.vectorstores.base import VectorStore @@ -15,59 +21,12 @@ from langchain.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger(__name__) -distance_metric_map = { - "l2": lambda a, b: np.linalg.norm(a - b, axis=1, ord=2), - "l1": lambda a, b: np.linalg.norm(a - b, axis=1, ord=1), - "max": lambda a, b: np.linalg.norm(a - b, axis=1, ord=np.inf), - "cos": lambda a, b: np.dot(a, b.T) - / (np.linalg.norm(a) * np.linalg.norm(b, axis=1)), - "dot": lambda a, b: np.dot(a, b.T), -} - - -def vector_search( - query_embedding: np.ndarray, - data_vectors: np.ndarray, - distance_metric: str = "L2", - k: Optional[int] = 4, -) -> Tuple[List, List]: - """Naive search for nearest neighbors - - args: - query_embedding: np.ndarray - data_vectors: np.ndarray - k (int): number of nearest neighbors - distance_metric: distance 
function 'L2' for Euclidean, 'L1' for Nuclear, 'Max' - l-infinity distnace, 'cos' for cosine similarity, 'dot' for dot product - - returns: - nearest_indices: List, indices of nearest neighbors - """ - if data_vectors.shape[0] == 0: - return [], [] - - # Calculate the distance between the query_vector and all data_vectors - distances = distance_metric_map[distance_metric](query_embedding, data_vectors) - nearest_indices = np.argsort(distances) - - nearest_indices = ( - nearest_indices[::-1][:k] if distance_metric in ["cos"] else nearest_indices[:k] - ) - - return nearest_indices.tolist(), distances[nearest_indices].tolist() - - -def dp_filter(x: dict, filter: Dict[str, str]) -> bool: - """Filter helper function for Deep Lake""" - metadata = x["metadata"].data()["value"] - return all(k in metadata and v == metadata[k] for k, v in filter.items()) - class DeepLake(VectorStore): """Wrapper around Deep Lake, a data lake for deep learning applications. - We implement naive similarity search and filtering for fast prototyping, - but it can be extended with Tensor Query Language (TQL) for production use cases + We integrated deeplake's similarity search and filtering for fast prototyping, + Now, it supports Tensor Query Language (TQL) for production use cases over billion rows. Why Deep Lake? @@ -97,97 +56,94 @@ class DeepLake(VectorStore): dataset_path: str = _LANGCHAIN_DEFAULT_DEEPLAKE_PATH, token: Optional[str] = None, embedding_function: Optional[Embeddings] = None, - read_only: Optional[bool] = False, - ingestion_batch_size: int = 1024, + read_only: bool = False, + ingestion_batch_size: int = 1000, num_workers: int = 0, verbose: bool = True, + exec_option: str = "python", **kwargs: Any, ) -> None: - """Initialize with Deep Lake client.""" + """Creates an empty DeepLakeVectorStore or loads an existing one. + + The DeepLakeVectorStore is located at the specified ``path``. 
+ + Examples: + >>> # Create a vector store with default tensors + >>> deeplake_vectorstore = DeepLake( + ... dataset_path = "./my_deeplake/", + ... ) + >>> + >>> # Create a vector store in the Deep Lake Managed Tensor Database + >>> data = DeepLake( + ... dataset_path = "hub://org_id/dataset_name", + ... exec_option = "tensor_db", + ... ) + + Args: + dataset_path (str): Path to existing dataset or where to create + a new one. Defaults to _LANGCHAIN_DEFAULT_DEEPLAKE_PATH. + token (str, optional): Activeloop token, for fetching credentials + to the dataset at path if it is a Deep Lake dataset. + Tokens are normally autogenerated. Optional. + embedding_function (Embeddings, optional): Embeddings object used + to embed either documents or a query. Optional. + read_only (bool): Open dataset in read-only mode. Default is False. + ingestion_batch_size (int): During data ingestion, data is divided + into batches. Batch size is the size of each batch. + Default is 1000. + num_workers (int): Number of workers to use during data ingestion. + Default is 0. + verbose (bool): Print dataset summary after each operation. + Default is True. + exec_option (str): DeepLakeVectorStore supports 3 ways to perform + searching - "python", "compute_engine", "tensor_db". + Default is "python". + - ``python`` - Pure-python implementation that runs on the client. + WARNING: using this with big datasets can lead to memory + issues. Data can be stored anywhere. + - ``compute_engine`` - C++ implementation of the Deep Lake Compute + Engine that runs on the client. Can be used for any data stored in + or connected to Deep Lake. Not for in-memory or local datasets. + - ``tensor_db`` - Hosted Managed Tensor Database that is + responsible for storage and query execution. Only for data stored in + the Deep Lake Managed Database. Use runtime = {"db_engine": True} during + dataset creation. + **kwargs: Other optional keyword arguments. 
+ + Raises: + ValueError: If deeplake is not installed or its version is unsupported.
+ """ + self.ingestion_batch_size = ingestion_batch_size self.num_workers = num_workers self.verbose = verbose - try: - import deeplake - from deeplake.constants import MB - except ImportError: + if _DEEPLAKE_INSTALLED is False: raise ValueError( "Could not import deeplake python package. " "Please install it with `pip install deeplake`." ) - self._deeplake = deeplake + + version = deeplake.__version__ + if version != "3.6.2": + raise ValueError( + "deeplake version should be = 3.6.3, but you've installed" + f" {version}. Consider changing deeplake version to 3.6.3 ." + ) self.dataset_path = dataset_path - creds_args = {"creds": kwargs["creds"]} if "creds" in kwargs else {} - if deeplake.exists(dataset_path, token=token, **creds_args) and not kwargs.get( - "overwrite", False - ): - if "overwrite" in kwargs: - del kwargs["overwrite"] - - self.ds = deeplake.load( - dataset_path, - token=token, - read_only=read_only, - verbose=self.verbose, - **kwargs, - ) - logger.info(f"Loading deeplake {dataset_path} from storage.") - if self.verbose: - print( - f"Deep Lake Dataset in {dataset_path} already exists, " - f"loading from the storage" - ) - self.ds.summary() - else: - if "overwrite" in kwargs: - del kwargs["overwrite"] - - self.ds = deeplake.empty( - dataset_path, - token=token, - overwrite=True, - verbose=self.verbose, - **kwargs, - ) - - with self.ds: - self.ds.create_tensor( - "text", - htype="text", - create_id_tensor=False, - create_sample_info_tensor=False, - create_shape_tensor=False, - chunk_compression="lz4", - ) - self.ds.create_tensor( - "metadata", - htype="json", - create_id_tensor=False, - create_sample_info_tensor=False, - create_shape_tensor=False, - chunk_compression="lz4", - ) - self.ds.create_tensor( - "embedding", - htype="generic", - dtype=np.float32, - create_id_tensor=False, - create_sample_info_tensor=False, - max_chunk_size=64 * MB, - create_shape_tensor=True, - ) - self.ds.create_tensor( - "ids", - htype="text", - create_id_tensor=False, - 
create_sample_info_tensor=False, - create_shape_tensor=False, - chunk_compression="lz4", - ) + self.vectorstore = DeepLakeVectorStore( + path=self.dataset_path, + embedding_function=embedding_function, + read_only=read_only, + token=token, + exec_option=exec_option, + verbose=verbose, + **kwargs, + ) self._embedding_function = embedding_function + self._id_tensor_name = "ids" if "ids" in self.vectorstore.tensors() else "id" def add_texts( self, @@ -198,154 +154,208 @@ class DeepLake(VectorStore): ) -> List[str]: """Run more texts through the embeddings and add to the vectorstore. + Examples: + >>> ids = deeplake_vectorstore.add_texts( + ... texts = , + ... metadatas = , + ... ids = , + ... ) + Args: texts (Iterable[str]): Texts to add to the vectorstore. metadatas (Optional[List[dict]], optional): Optional list of metadatas. ids (Optional[List[str]], optional): Optional list of IDs. + **kwargs: other optional keyword arguments. Returns: List[str]: List of IDs of the added texts. """ - - if ids is None: - ids = [str(uuid.uuid1()) for _ in texts] - - text_list = list(texts) + kwargs = {} + if ids: + if self._id_tensor_name == "ids": # for backwards compatibility + kwargs["ids"] = ids + else: + kwargs["id"] = ids if metadatas is None: - metadatas = [{}] * len(text_list) + metadatas = [{}] * len(list(texts)) - elements = list(zip(text_list, metadatas, ids)) - - @self._deeplake.compute - def ingest(sample_in: list, sample_out: list) -> None: - text_list = [s[0] for s in sample_in] - - embeds: Sequence[Optional[np.ndarray]] = [] - - if self._embedding_function is not None: - embeddings = self._embedding_function.embed_documents(text_list) - embeds = [np.array(e, dtype=np.float32) for e in embeddings] - else: - embeds = [None] * len(text_list) - - for s, e in zip(sample_in, embeds): - sample_out.append( - { - "text": s[0], - "metadata": s[1], - "ids": s[2], - "embedding": e, - } - ) - - batch_size = min(self.ingestion_batch_size, len(elements)) - if batch_size == 0: - 
return [] - - batched = [ - elements[i : i + batch_size] for i in range(0, len(elements), batch_size) - ] - - ingest().eval( - batched, - self.ds, - num_workers=min(self.num_workers, len(batched) // max(self.num_workers, 1)), + return self.vectorstore.add( + text=texts, + metadata=metadatas, + embedding_data=texts, + embedding_tensor="embedding", + embedding_function=kwargs.get("embedding_function") + or self._embedding_function.embed_documents, # type: ignore + return_ids=True, **kwargs, ) - self.ds.commit(allow_empty=True) - if self.verbose: - self.ds.summary() - return ids - def _search_helper( + def _search_tql( self, - query: Any[str, None] = None, - embedding: Any[float, None] = None, - k: int = 4, - distance_metric: str = "L2", - use_maximal_marginal_relevance: Optional[bool] = False, - fetch_k: Optional[int] = 20, - filter: Optional[Any[Dict[str, str], Callable, str]] = None, - return_score: Optional[bool] = False, - **kwargs: Any, + tql_query: Optional[str], + exec_option: Optional[str] = None, + return_score: bool = False, ) -> Any[List[Document], List[Tuple[Document, float]]]: - """Return docs most similar to query. + """Function for performing tql_search. Args: - query: Text to look up documents similar to. - embedding: Embedding function to use. Defaults to None. - k: Number of Documents to return. Defaults to 4. - distance_metric: `L2` for Euclidean, `L1` for Nuclear, - `max` L-infinity distance, `cos` for cosine similarity, - 'dot' for dot product. Defaults to `L2`. - filter: Attribute filter by metadata example {'key': 'value'}. It can also - take [Deep Lake filter] - (https://docs.deeplake.ai/en/latest/deeplake.core.dataset.html#deeplake.core.dataset.Dataset.filter) - Defaults to None. - maximal_marginal_relevance: Whether to use maximal marginal relevance. - Defaults to False. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - Defaults to 20. - return_score: Whether to return the score. Defaults to False. 
+ tql_query (str): TQL Query string for direct evaluation. + Available only for `compute_engine` and `tensor_db`. + exec_option (str, optional): Supports 3 ways to search. + Could be "python", "compute_engine" or "tensor_db". Default is "python". + - ``python`` - Pure-python implementation for the client. + WARNING: not recommended for big datasets due to potential memory + issues. + - ``compute_engine`` - C++ implementation of Deep Lake Compute + Engine for the client. Not for in-memory or local datasets. + - ``tensor_db`` - Hosted Managed Tensor Database for storage + and query execution. Only for data in Deep Lake Managed Database. + Use runtime = {"db_engine": True} during dataset creation. + return_score (bool): Return score with document. Default is False. Returns: - List of Documents selected by the specified distance metric, - if return_score True, return a tuple of (Document, score) + List[Document] - A list of documents + + Raises: + ValueError: If return_score is True but some condition is not met. """ - view = self.ds - - # attribute based filtering - if filter is not None: - if isinstance(filter, dict): - filter = partial(dp_filter, filter=filter) - - view = view.filter(filter) - if len(view) == 0: - return [] - - if self._embedding_function is None: - view = view.filter(lambda x: query in x["text"].data()["value"]) - scores = [1.0] * len(view) - - if use_maximal_marginal_relevance: - raise ValueError( - "For MMR search, you must specify an embedding function on" - "creation." 
- ) - - else: - emb = embedding or self._embedding_function.embed_query( - query - ) # type: ignore - query_emb = np.array(emb, dtype=np.float32) - embeddings = view.embedding.numpy(fetch_chunks=True) - k_search = fetch_k if use_maximal_marginal_relevance else k - indices, scores = vector_search( - query_emb, - embeddings, - k=k_search, - distance_metric=distance_metric.lower(), - ) - - view = view[indices] - if use_maximal_marginal_relevance: - lambda_mult = kwargs.get("lambda_mult", 0.5) - indices = maximal_marginal_relevance( - query_emb, - embeddings[indices], - k=min(k, len(indices)), - lambda_mult=lambda_mult, - ) - view = view[indices] - scores = [scores[i] for i in indices] + result = self.vectorstore.search( + query=tql_query, + exec_option=exec_option, + ) + metadatas = result["metadata"] + texts = result["text"] docs = [ Document( - page_content=el["text"].data()["value"], - metadata=el["metadata"].data()["value"], + page_content=text, + metadata=metadata, ) - for el in view + for text, metadata in zip(texts, metadatas) + ] + + if return_score: + raise ValueError("scores can't be returned with tql search") + + return docs + + def _search( + self, + query: Optional[str] = None, + embedding: Optional[Union[List[float], np.ndarray]] = None, + embedding_function: Optional[Callable] = None, + k: int = 4, + distance_metric: str = "L2", + use_maximal_marginal_relevance: bool = False, + fetch_k: Optional[int] = 20, + filter: Optional[Union[Dict, Callable]] = None, + return_score: bool = False, + exec_option: Optional[str] = None, + **kwargs: Any, + ) -> Any[List[Document], List[Tuple[Document, float]]]: + """ + Return docs similar to query. + + Args: + query (str, optional): Text to look up similar docs. + embedding (Union[List[float], np.ndarray], optional): Query's embedding. + embedding_function (Callable, optional): Function to convert `query` + into embedding. + k (int): Number of Documents to return. 
+ distance_metric (str): `L2` for Euclidean, `L1` for Nuclear, `max` + for L-infinity distance, `cos` for cosine similarity, 'dot' for dot + product. + filter (Union[Dict, Callable], optional): Additional filter prior + to the embedding search. + - ``Dict`` - Key-value search on tensors of htype json, on an + AND basis (a sample must satisfy all key-value filters to be True) + Dict = {"tensor_name_1": {"key": value}, + "tensor_name_2": {"key": value}} + - ``Function`` - Any function compatible with `deeplake.filter`. + use_maximal_marginal_relevance (bool): Use maximal marginal relevance. + fetch_k (int): Number of Documents for MMR algorithm. + return_score (bool): Return the score. + exec_option (str, optional): Supports 3 ways to perform searching. + Could be "python", "compute_engine" or "tensor_db". + - ``python`` - Pure-python implementation for the client. + WARNING: not recommended for big datasets. + - ``compute_engine`` - C++ implementation of Deep Lake Compute + Engine for the client. Not for in-memory or local datasets. + - ``tensor_db`` - Hosted Managed Tensor Database for storage + and query execution. Only for data in Deep Lake Managed Database. + Use runtime = {"db_engine": True} during dataset creation. + **kwargs: Additional keyword arguments. + + Returns: + List of Documents by the specified distance metric, + if return_score True, return a tuple of (Document, score) + + Raises: + ValueError: if both `embedding` and `embedding_function` are not specified. 
+ """ + + if kwargs.get("tql_query"): + return self._search_tql( + tql_query=kwargs["tql_query"], + exec_option=exec_option, + return_score=return_score, + ) + + if embedding_function: + _embedding_function = embedding_function + elif self._embedding_function: + _embedding_function = self._embedding_function.embed_query + else: + _embedding_function = None + + if embedding is None: + if _embedding_function is None: + raise ValueError( + "Either `embedding` or `embedding_function` needs to be" + " specified." + ) + + embedding = _embedding_function(query) if query else None + + if isinstance(embedding, list): + embedding = np.array(embedding, dtype=np.float32) + if len(embedding.shape) > 1: + embedding = embedding[0] + + result = self.vectorstore.search( + embedding=embedding, + k=fetch_k if use_maximal_marginal_relevance else k, + distance_metric=distance_metric, + filter=filter, + exec_option=exec_option, + return_tensors=["embedding", "metadata", "text"], + ) + + scores = result["score"] + embeddings = result["embedding"] + metadatas = result["metadata"] + texts = result["text"] + + if use_maximal_marginal_relevance: + lambda_mult = kwargs.get("lambda_mult", 0.5) + indices = maximal_marginal_relevance( # type: ignore + embedding, # type: ignore + embeddings, + k=min(k, len(texts)), + lambda_mult=lambda_mult, + ) + + scores = [scores[i] for i in indices] + texts = [texts[i] for i in indices] + metadatas = [metadatas[i] for i in indices] + + docs = [ + Document( + page_content=text, + metadata=metadata, + ) + for text, metadata in zip(texts, metadatas) ] if return_score: @@ -354,76 +364,180 @@ class DeepLake(VectorStore): return docs def similarity_search( - self, query: str, k: int = 4, **kwargs: Any + self, + query: str, + k: int = 4, + **kwargs: Any, ) -> List[Document]: - """Return docs most similar to query. + """ + Return docs most similar to query. + + Examples: + >>> # Search using an embedding + >>> data = vector_store.similarity_search( + ... 
query=, + ... k=, + ... exec_option=, + ... ) + >>> # Run tql search: + >>> data = vector_store.tql_search( + ... tql_query="SELECT * WHERE id == ", + ... exec_option="compute_engine", + ... ) Args: - query: text to embed and run the query on. - k: Number of Documents to return. - Defaults to 4. - query: Text to look up documents similar to. - embedding: Embedding function to use. - Defaults to None. - k: Number of Documents to return. - Defaults to 4. - distance_metric: `L2` for Euclidean, `L1` for Nuclear, `max` - L-infinity distance, `cos` for cosine similarity, 'dot' for dot product - Defaults to `L2`. - filter: Attribute filter by metadata example {'key': 'value'}. - Defaults to None. - maximal_marginal_relevance: Whether to use maximal marginal relevance. - Defaults to False. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - Defaults to 20. - return_score: Whether to return the score. Defaults to False. + k (int): Number of Documents to return. Defaults to 4. + query (str): Text to look up similar documents. + **kwargs: Additional keyword arguments include: + embedding (Callable): Embedding function to use. Defaults to None. + distance_metric (str): 'L2' for Euclidean, 'L1' for Nuclear, 'max' + for L-infinity, 'cos' for cosine, 'dot' for dot product. + Defaults to 'L2'. + filter (Union[Dict, Callable], optional): Additional filter + before embedding search. + - Dict: Key-value search on tensors of htype json, + (sample must satisfy all key-value filters) + Dict = {"tensor_1": {"key": value}, "tensor_2": {"key": value}} + - Function: Compatible with `deeplake.filter`. + Defaults to None. + exec_option (str): Supports 3 ways to perform searching. + 'python', 'compute_engine', or 'tensor_db'. Defaults to 'python'. + - 'python': Pure-python implementation for the client. + WARNING: not recommended for big datasets. + - 'compute_engine': C++ implementation of the Compute Engine for + the client. Not for in-memory or local datasets. 
+ - 'tensor_db': Managed Tensor Database for storage and query. + Only for data in Deep Lake Managed Database. + Use `runtime = {"db_engine": True}` during dataset creation. Returns: - List of Documents most similar to the query vector. + List[Document]: List of Documents most similar to the query vector. """ - return self._search_helper(query=query, k=k, **kwargs) + + return self._search( + query=query, + k=k, + use_maximal_marginal_relevance=False, + return_score=False, + **kwargs, + ) def similarity_search_by_vector( - self, embedding: List[float], k: int = 4, **kwargs: Any + self, + embedding: Union[List[float], np.ndarray], + k: int = 4, + **kwargs: Any, ) -> List[Document]: - """Return docs most similar to embedding vector. + """ + Return docs most similar to embedding vector. + + Examples: + >>> # Search using an embedding + >>> data = vector_store.similarity_search_by_vector( + ... embedding=, + ... k=, + ... exec_option=, + ... ) Args: - embedding: Embedding to look up documents similar to. - k: Number of Documents to return. Defaults to 4. + embedding (Union[List[float], np.ndarray]): + Embedding to find similar docs. + k (int): Number of Documents to return. Defaults to 4. + **kwargs: Additional keyword arguments including: + filter (Union[Dict, Callable], optional): + Additional filter before embedding search. + - ``Dict`` - Key-value search on tensors of htype json. True + if all key-value filters are satisfied. + Dict = {"tensor_name_1": {"key": value}, + "tensor_name_2": {"key": value}} + - ``Function`` - Any function compatible with + `deeplake.filter`. + Defaults to None. + exec_option (str): Options for search execution include + "python", "compute_engine", or "tensor_db". Defaults to + "python". + - "python" - Pure-python implementation running on the client. + Can be used for data stored anywhere. WARNING: using this + option with big datasets is discouraged due to potential + memory issues. 
+ - "compute_engine" - Performant C++ implementation of the Deep + Lake Compute Engine. Runs on the client and can be used for + any data stored in or connected to Deep Lake. It cannot be + used with in-memory or local datasets. + - "tensor_db" - Performant, fully-hosted Managed Tensor Database. + Responsible for storage and query execution. Only available + for data stored in the Deep Lake Managed Database. + To store datasets in this database, specify + `runtime = {"db_engine": True}` during dataset creation. + distance_metric (str): `L2` for Euclidean, `L1` for Nuclear, + `max` for L-infinity distance, `cos` for cosine similarity, + 'dot' for dot product. Defaults to `L2`. Returns: - List of Documents most similar to the query vector. + List[Document]: List of Documents most similar to the query vector. """ - return self._search_helper(embedding=embedding, k=k, **kwargs) + + return self._search( + embedding=embedding, + k=k, + use_maximal_marginal_relevance=False, + return_score=False, + **kwargs, + ) def similarity_search_with_score( self, query: str, - distance_metric: str = "L2", k: int = 4, - filter: Optional[Dict[str, str]] = None, + **kwargs: Any, ) -> List[Tuple[Document, float]]: - """Run similarity search with Deep Lake with distance returned. + """ + Run similarity search with Deep Lake with distance returned. + + Examples: + >>> data = vector_store.similarity_search_with_score( + ... query=, + ... embedding= + ... k=, + ... exec_option=, + ... ) Args: query (str): Query text to search for. - distance_metric: `L2` for Euclidean, `L1` for Nuclear, `max` L-infinity - distance, `cos` for cosine similarity, 'dot' for dot product. - Defaults to `L2`. k (int): Number of results to return. Defaults to 4. - filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + **kwargs: Additional keyword arguments. 
Some of these arguments are: + distance_metric: `L2` for Euclidean, `L1` for Nuclear, `max` L-infinity + distance, `cos` for cosine similarity, 'dot' for dot product. + Defaults to `L2`. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + embedding_function (Callable): Embedding function to use. Defaults + to None. + exec_option (str): DeepLakeVectorStore supports 3 ways to perform + searching. It could be either "python", "compute_engine" or + "tensor_db". Defaults to "python". + - "python" - Pure-python implementation running on the client. + Can be used for data stored anywhere. WARNING: using this + option with big datasets is discouraged due to potential + memory issues. + - "compute_engine" - Performant C++ implementation of the Deep + Lake Compute Engine. Runs on the client and can be used for + any data stored in or connected to Deep Lake. It cannot be used + with in-memory or local datasets. + - "tensor_db" - Performant, fully-hosted Managed Tensor Database. + Responsible for storage and query execution. Only available for + data stored in the Deep Lake Managed Database. To store datasets + in this database, specify `runtime = {"db_engine": True}` + during dataset creation. Returns: List[Tuple[Document, float]]: List of documents most similar to the query - text with distance in float. - """ - return self._search_helper( + text with distance in float.""" + + return self._search( query=query, k=k, - filter=filter, return_score=True, - distance_metric=distance_metric, + **kwargs, ) def max_marginal_relevance_search_by_vector( @@ -432,30 +546,56 @@ class DeepLake(VectorStore): k: int = 4, fetch_k: int = 20, lambda_mult: float = 0.5, + exec_option: Optional[str] = None, **kwargs: Any, ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. - Maximal marginal relevance optimizes for similarity to query AND diversity - among selected documents. 
+ """ + Return docs selected using the maximal marginal relevance. Maximal marginal + relevance optimizes for similarity to query AND diversity among selected docs. + + Examples: + >>> data = vector_store.max_marginal_relevance_search_by_vector( + ... embedding=, + ... fetch_k=, + ... k=, + ... exec_option=, + ... ) Args: embedding: Embedding to look up documents similar to. k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. - Defaults to 0.5. + fetch_k: Number of Documents to fetch for MMR algorithm. + lambda_mult: Number between 0 and 1 determining the degree of diversity. + 0 corresponds to max diversity and 1 to min diversity. Defaults to 0.5. + exec_option (str): DeepLakeVectorStore supports 3 ways for searching. + Could be "python", "compute_engine" or "tensor_db". Defaults to + "python". + - "python" - Pure-python implementation running on the client. + Can be used for data stored anywhere. WARNING: using this + option with big datasets is discouraged due to potential + memory issues. + - "compute_engine" - Performant C++ implementation of the Deep + Lake Compute Engine. Runs on the client and can be used for + any data stored in or connected to Deep Lake. It cannot be used + with in-memory or local datasets. + - "tensor_db" - Performant, fully-hosted Managed Tensor Database. + Responsible for storage and query execution. Only available for + data stored in the Deep Lake Managed Database. To store datasets + in this database, specify `runtime = {"db_engine": True}` + during dataset creation. + **kwargs: Additional keyword arguments. Returns: - List of Documents selected by maximal marginal relevance. + List[Documents] - A list of documents. 
""" - return self._search_helper( + + return self._search( embedding=embedding, k=k, fetch_k=fetch_k, use_maximal_marginal_relevance=True, lambda_mult=lambda_mult, + exec_option=exec_option, **kwargs, ) @@ -465,32 +605,67 @@ class DeepLake(VectorStore): k: int = 4, fetch_k: int = 20, lambda_mult: float = 0.5, + exec_option: Optional[str] = None, **kwargs: Any, ) -> List[Document]: - """Return docs selected using the maximal marginal relevance. + """Return docs selected using maximal marginal relevance. + Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents. + + Examples: + >>> # Search using an embedding + >>> data = vector_store.max_marginal_relevance_search( + ... query = , + ... embedding_function = , + ... k = , + ... exec_option = , + ... ) + Args: query: Text to look up documents similar to. k: Number of Documents to return. Defaults to 4. - fetch_k: Number of Documents to fetch to pass to MMR algorithm. - lambda_mult: Number between 0 and 1 that determines the degree - of diversity among the results with 0 corresponding - to maximum diversity and 1 to minimum diversity. + fetch_k: Number of Documents for MMR algorithm. + lambda_mult: Value between 0 and 1. 0 corresponds + to maximum diversity and 1 to minimum. Defaults to 0.5. + exec_option (str): Supports 3 ways to perform searching. + - "python" - Pure-python implementation running on the client. + Can be used for data stored anywhere. WARNING: using this + option with big datasets is discouraged due to potential + memory issues. + - "compute_engine" - Performant C++ implementation of the Deep + Lake Compute Engine. Runs on the client and can be used for + any data stored in or connected to Deep Lake. It cannot be + used with in-memory or local datasets. + - "tensor_db" - Performant, fully-hosted Managed Tensor Database. + Responsible for storage and query execution. Only available + for data stored in the Deep Lake Managed Database. 
To store + datasets in this database, specify + `runtime = {"db_engine": True}` during dataset creation. + **kwargs: Additional keyword arguments + Returns: List of Documents selected by maximal marginal relevance. + + Raises: + ValueError: when MRR search is on but embedding function is + not specified. """ - if self._embedding_function is None: + embedding_function = kwargs.get("embedding") or self._embedding_function + if embedding_function is None: raise ValueError( - "For MMR search, you must specify an embedding function on" "creation." + "For MMR search, you must specify an embedding function on" + " `creation` or during add call." ) - return self._search_helper( + return self._search( query=query, k=k, fetch_k=fetch_k, use_maximal_marginal_relevance=True, lambda_mult=lambda_mult, + exec_option=exec_option, + embedding_function=embedding_function, # type: ignore **kwargs, ) @@ -509,8 +684,17 @@ class DeepLake(VectorStore): If a dataset_path is specified, the dataset will be persisted in that location, otherwise by default at `./deeplake` + Examples: + >>> # Search using an embedding + >>> vector_store = DeepLake.from_texts( + ... texts = , + ... embedding_function = , + ... k = , + ... exec_option = , + ... ) + Args: - path (str, pathlib.Path): - The full path to the dataset. Can be: + dataset_path (str): - The full path to the dataset. Can be: - Deep Lake cloud path of the form ``hub://username/dataset_name``. To write to Deep Lake cloud datasets, ensure that you are logged in to Deep Lake @@ -525,18 +709,35 @@ class DeepLake(VectorStore): - In-memory path of the form ``mem://path/to/dataset`` which doesn't save the dataset, but keeps it in memory instead. Should be used only for testing as it does not persist. - documents (List[Document]): List of documents to add. + texts (List[Document]): List of documents to add. embedding (Optional[Embeddings]): Embedding function. Defaults to None. + Note, in other places, it is called embedding_function. 
metadatas (Optional[List[dict]]): List of metadatas. Defaults to None. ids (Optional[List[str]]): List of document IDs. Defaults to None. + **kwargs: Additional keyword arguments. Returns: DeepLake: Deep Lake dataset. + + Raises: + ValueError: If 'embedding' is provided in kwargs. This is deprecated, + please use `embedding_function` instead. """ + if kwargs.get("embedding"): + raise ValueError( + "using embedding as embedidng_functions is deprecated. " + "Please use `embedding_function` instead." + ) + deeplake_dataset = cls( dataset_path=dataset_path, embedding_function=embedding, **kwargs ) - deeplake_dataset.add_texts(texts=texts, metadatas=metadatas, ids=ids) + deeplake_dataset.add_texts( + texts=texts, + metadatas=metadatas, + ids=ids, + embedding_function=embedding.embed_documents, # type: ignore + ) return deeplake_dataset def delete( @@ -545,7 +746,7 @@ class DeepLake(VectorStore): filter: Any[Dict[str, str], None] = None, delete_all: Any[bool, None] = None, ) -> bool: - """Delete the entities in the dataset + """Delete the entities in the dataset. Args: ids (Optional[List[str]], optional): The document_ids to delete. @@ -554,33 +755,29 @@ class DeepLake(VectorStore): Defaults to None. delete_all (Optional[bool], optional): Whether to drop the dataset. Defaults to None. + + Returns: + bool: Whether the delete operation was successful. 
""" - if delete_all: - self.ds.delete(large_ok=True) - return True - - view = None - if ids: - view = self.ds.filter(lambda x: x["ids"].data()["value"] in ids) - ids = list(view.sample_indices) - - if filter: - if view is None: - view = self.ds - view = view.filter(partial(dp_filter, filter=filter)) - ids = list(view.sample_indices) - - with self.ds: - for id in sorted(ids)[::-1]: - self.ds.pop(id) - - self.ds.commit(f"deleted {len(ids)} samples", allow_empty=True) + self.vectorstore.delete( + ids=ids, + filter=filter, + delete_all=delete_all, + ) return True @classmethod def force_delete_by_path(cls, path: str) -> None: - """Force delete dataset by path""" + """Force delete dataset by path. + + Args: + path (str): path of the dataset to delete. + + Raises: + ValueError: if deeplake is not installed. + """ + try: import deeplake except ImportError: @@ -593,7 +790,3 @@ class DeepLake(VectorStore): def delete_dataset(self) -> None: """Delete the collection.""" self.delete(delete_all=True) - - def persist(self) -> None: - """Persist the collection.""" - self.ds.flush() diff --git a/poetry.lock b/poetry.lock index 5652b4f3..c8e6295f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1762,13 +1762,13 @@ files = [ [[package]] name = "deeplake" -version = "3.6.1" +version = "3.6.4" description = "Activeloop Deep Lake" category = "main" optional = false python-versions = "*" files = [ - {file = "deeplake-3.6.1.tar.gz", hash = "sha256:78b0280e3e21c6731a96a9a2519a24e767df708c309e934ab473dfbc17b13581"}, + {file = "deeplake-3.6.4.tar.gz", hash = "sha256:cd31e772e00012a0cfdcd9d2cfe0e7fa3eaea1de122e98ae4f54e15d2722b94f"}, ] [package.dependencies] @@ -1785,12 +1785,12 @@ pyjwt = "*" tqdm = "*" [package.extras] -all = ["IPython", "av (>=8.1.0)", "azure-cli", "azure-identity", "azure-storage-blob", "flask", "google-api-python-client (>=2.31.0,<2.32.0)", "google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "google-cloud-storage (>=1.42.0,<1.43.0)", "laspy", 
"libdeeplake (==0.0.56)", "nibabel", "oauth2client (>=4.1.3,<4.2.0)", "pydicom"] +all = ["IPython", "av (>=8.1.0)", "azure-cli", "azure-identity", "azure-storage-blob", "flask", "google-api-python-client (>=2.31.0,<2.32.0)", "google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "google-cloud-storage (>=1.42.0,<1.43.0)", "laspy", "libdeeplake (==0.0.59)", "nibabel", "oauth2client (>=4.1.3,<4.2.0)", "pydicom"] audio = ["av (>=8.1.0)"] av = ["av (>=8.1.0)"] azure = ["azure-cli", "azure-identity", "azure-storage-blob"] dicom = ["nibabel", "pydicom"] -enterprise = ["libdeeplake (==0.0.56)", "pyjwt"] +enterprise = ["libdeeplake (==0.0.59)", "pyjwt"] gcp = ["google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "google-cloud-storage (>=1.42.0,<1.43.0)"] gdrive = ["google-api-python-client (>=2.31.0,<2.32.0)", "google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "oauth2client (>=4.1.3,<4.2.0)"] medical = ["nibabel", "pydicom"] @@ -11486,4 +11486,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "dd54bb9201b260b734ceebad2c6629b4b1b3bf224731de092782678219f32120" +content-hash = "71afe34849d7dc7189fe79e795b12a48d5e185ea2146880ba80fc6770b6e6272" diff --git a/pyproject.toml b/pyproject.toml index 088e0ec2..4f0bce6a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -59,7 +59,7 @@ arxiv = {version = "^1.4", optional = true} pypdf = {version = "^3.4.0", optional = true} networkx = {version="^2.6.3", optional = true} aleph-alpha-client = {version="^2.15.0", optional = true} -deeplake = {version = "^3.3.0", optional = true} +deeplake = {version = "^3.6.2", optional = true} pgvector = {version = "^0.1.6", optional = true} psycopg2-binary = {version = "^2.9.5", optional = true} pyowm = {version = "^3.3.0", optional = true} diff --git a/tests/integration_tests/vectorstores/test_deeplake.py b/tests/integration_tests/vectorstores/test_deeplake.py index 
634d5237..823cbef8 100644 --- a/tests/integration_tests/vectorstores/test_deeplake.py +++ b/tests/integration_tests/vectorstores/test_deeplake.py @@ -66,8 +66,6 @@ def test_deeplakewith_persistence() -> None: output = docsearch.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] - docsearch.persist() - # Get a new VectorStore from the persisted directory docsearch = DeepLake( dataset_path=dataset_path, @@ -98,8 +96,6 @@ def test_deeplake_overwrite_flag() -> None: output = docsearch.similarity_search("foo", k=1) assert output == [Document(page_content="foo")] - docsearch.persist() - # Get a new VectorStore from the persisted directory, with no overwrite (implicit) docsearch = DeepLake( dataset_path=dataset_path, @@ -125,9 +121,8 @@ def test_deeplake_overwrite_flag() -> None: embedding_function=FakeEmbeddings(), overwrite=True, ) - output = docsearch.similarity_search("foo", k=1) - # assert page no longer present - assert output == [] + with pytest.raises(ValueError): + output = docsearch.similarity_search("foo", k=1) def test_similarity_search(deeplake_datastore: DeepLake, distance_metric: str) -> None: @@ -172,7 +167,10 @@ def test_similarity_search_with_filter( """Test similarity search.""" output = deeplake_datastore.similarity_search( - "foo", k=1, distance_metric=distance_metric, filter={"page": "1"} + "foo", + k=1, + distance_metric=distance_metric, + filter={"metadata": {"page": "1"}}, ) assert output == [Document(page_content="bar", metadata={"page": "1"})] deeplake_datastore.delete_dataset() @@ -196,19 +194,29 @@ def test_max_marginal_relevance_search(deeplake_datastore: DeepLake) -> None: def test_delete_dataset_by_ids(deeplake_datastore: DeepLake) -> None: """Test delete dataset.""" - id = deeplake_datastore.ds.ids.data()["value"][0] + id = deeplake_datastore.vectorstore.dataset.id.data()["value"][0] deeplake_datastore.delete(ids=[id]) - assert deeplake_datastore.similarity_search("foo", k=1, filter={"page": "0"}) == [] - assert 
len(deeplake_datastore.ds) == 2 + assert ( + deeplake_datastore.similarity_search( + "foo", k=1, filter={"metadata": {"page": "0"}} + ) + == [] + ) + assert len(deeplake_datastore.vectorstore) == 2 deeplake_datastore.delete_dataset() def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None: """Test delete dataset.""" - deeplake_datastore.delete(filter={"page": "1"}) - assert deeplake_datastore.similarity_search("bar", k=1, filter={"page": "1"}) == [] - assert len(deeplake_datastore.ds) == 2 + deeplake_datastore.delete(filter={"metadata": {"page": "1"}}) + assert ( + deeplake_datastore.similarity_search( + "bar", k=1, filter={"metadata": {"page": "1"}} + ) + == [] + ) + assert len(deeplake_datastore.vectorstore.dataset) == 2 deeplake_datastore.delete_dataset()