From 2c0023393b66374fc1d28941872381e4a3849dec Mon Sep 17 00:00:00 2001 From: Davit Buniatyan Date: Sun, 23 Apr 2023 21:23:54 -0700 Subject: [PATCH] Deep Lake mini upgrades (#3375) Improvements * set default num_workers for ingestion to 0 * upgraded notebooks for avoiding dataset creation ambiguity * added `force_delete_dataset_by_path` * bumped deeplake to 3.3.0 * creds arg passing to deeplake object that would allow custom S3 Notes * please double check if poetry is not messed up (thanks!) Asks * Would be great to create a shared slack channel for quick questions --------- Co-authored-by: Davit Buniatyan --- .../vectorstores/examples/deeplake.ipynb | 484 ++++++++++++++---- ...tter-the-algorithm-analysis-deeplake.ipynb | 84 +-- .../semantic-search-over-chat.ipynb | 2 +- langchain/vectorstores/deeplake.py | 41 +- poetry.lock | 19 +- pyproject.toml | 2 +- .../vectorstores/test_deeplake.py | 7 + 7 files changed, 475 insertions(+), 164 deletions(-) diff --git a/docs/modules/indexes/vectorstores/examples/deeplake.ipynb b/docs/modules/indexes/vectorstores/examples/deeplake.ipynb index ff0a5a48..0c4634c8 100644 --- a/docs/modules/indexes/vectorstores/examples/deeplake.ipynb +++ b/docs/modules/indexes/vectorstores/examples/deeplake.ipynb @@ -22,7 +22,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 45, "metadata": {}, "outputs": [], "source": [ @@ -33,7 +33,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 46, "metadata": {}, "outputs": [], "source": [ @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -60,16 +60,24 @@ "embeddings = OpenAIEmbeddings()" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Creates a dataset locally at `./my_deeplake/`, then runs similarity search " + ] + }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 49, "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ - "mem://langchain loaded successfully.\n" + "./my_deeplake/ loaded successfully.\n" ] }, { @@ -83,7 +91,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dataset(path='mem://langchain', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "Dataset(path='./my_deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])\n", "\n", " tensor htype shape dtype compression\n", " ------- ------- ------- ------- ------- \n", @@ -95,15 +103,17 @@ } ], "source": [ - "db = DeepLake.from_documents(docs, embeddings)\n", - "\n", + "db = DeepLake(dataset_path=\"./my_deeplake/\", embedding_function=embeddings, overwrite=True)\n", + "db.add_documents(docs)\n", + "# or shorter\n", + "# db = DeepLake.from_documents(docs, dataset_path=\"./my_deeplake/\", embedding=embeddings, overwrite=True)\n", "query = \"What did the president say about Ketanji Brown Jackson\"\n", "docs = db.similarity_search(query)" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -124,6 +134,62 @@ "print(docs[0].page_content)" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Later, you can reload the dataset without recomputing embeddings" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "./my_deeplake/ loaded successfully.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Deep Lake Dataset in ./my_deeplake/ already exists, loading from the storage\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset(path='./my_deeplake/', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "\n", + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding generic (4, 1536) float32 None \n", + " ids text (4, 1) str None \n", + 
" metadata json (4, 1) str None \n", + " text text (4, 1) str None \n" + ] + } + ], + "source": [ + "db = DeepLake(dataset_path=\"./my_deeplake/\", embedding_function=embeddings, read_only=True)\n", + "docs = db.similarity_search(query)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Deep Lake, for now, is single writer and multiple reader. Setting `read_only=True` helps to avoid acquiring the writer lock." + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -134,14 +200,14 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/media/sdb/davit/.local/lib/python3.10/site-packages/langchain/llms/openai.py:624: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. Instead, please use: `from langchain.chat_models import ChatOpenAI`\n", + "/media/sdb/davit/Git/experiments/langchain/langchain/llms/openai.py:672: UserWarning: You are trying to use a chat model. This way of initializing it is no longer supported. 
Instead, please use: `from langchain.chat_models import ChatOpenAI`\n", " warnings.warn(\n" ] } @@ -155,16 +221,16 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'The president nominated Circuit Court of Appeals Judge Ketanji Brown Jackson for the United States Supreme Court and praised her qualifications and broad support from both Democrats and Republicans.'" + "\"The president nominated Ketanji Brown Jackson to serve on the United States Supreme Court, describing her as one of the nation's top legal minds and a consensus builder with a background in private practice and public defense, and noting that she has received broad support from both Democrats and Republicans.\"" ] }, - "execution_count": 10, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } @@ -184,14 +250,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 54, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "mem://langchain loaded successfully.\n" + "./my_deeplake/ loaded successfully.\n" ] }, { @@ -205,14 +271,14 @@ "name": "stdout", "output_type": "stream", "text": [ - "Dataset(path='mem://langchain', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "Dataset(path='./my_deeplake/', tensors=['embedding', 'ids', 'metadata', 'text'])\n", "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (42, 1536) float32 None \n", - " ids text (42, 1) str None \n", - " metadata json (42, 1) str None \n", - " text text (42, 1) str None \n" + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding generic (4, 1536) float32 None \n", + " ids text (4, 1) str None \n", + " metadata json (4, 1) str None \n", + " text text (4, 1) str None \n" ] }, { @@ -227,31 +293,29 @@ "for d in docs:\n", " d.metadata['year'] = 
random.randint(2012, 2014)\n", "\n", - "db = DeepLake.from_documents(docs, embeddings)" + "db = DeepLake.from_documents(docs, embeddings, dataset_path=\"./my_deeplake/\", overwrite=True)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "100%|██████████| 42/42 [00:00<00:00, 3456.17it/s]\n" + "100%|██████████| 4/4 [00:00<00:00, 1080.24it/s]\n" ] }, { "data": { "text/plain": [ - "[Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. 
I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='Vice President Harris and I ran for office with a new economic vision for America. \\n\\nInvest in America. Educate Americans. Grow the workforce. Build the economy from the bottom up \\nand the middle out, not from the top down. \\n\\nBecause we know that when the middle class grows, the poor have a ladder up and the wealthy do very well. \\n\\nAmerica used to have the best roads, bridges, and airports on Earth. \\n\\nNow our infrastructure is ranked 13th in the world. \\n\\nWe won’t be able to compete for the jobs of the 21st Century if we don’t fix that. \\n\\nThat’s why it was so important to pass the Bipartisan Infrastructure Law—the most sweeping investment to rebuild America in history. \\n\\nThis was a bipartisan effort, and I want to thank the members of both parties who worked to make it happen. \\n\\nWe’re done talking about infrastructure weeks. \\n\\nWe’re going to have an infrastructure decade.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='It is going to transform America and put us on a path to win the economic competition of the 21st Century that we face with the rest of the world—particularly with China. \\n\\nAs I’ve told Xi Jinping, it is never a good bet to bet against the American people. 
\\n\\nWe’ll create good jobs for millions of Americans, modernizing roads, airports, ports, and waterways all across America. \\n\\nAnd we’ll do it all to withstand the devastating effects of the climate crisis and promote environmental justice. \\n\\nWe’ll build a national network of 500,000 electric vehicle charging stations, begin to replace poisonous lead pipes—so every child—and every American—has clean water to drink at home and at school, provide affordable high-speed internet for every American—urban, suburban, rural, and tribal communities. \\n\\n4,000 projects have already been announced. \\n\\nAnd tonight, I’m announcing that this year we will start fixing over 65,000 miles of highway and 1,500 bridges in disrepair.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013})]" + "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", + " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. 
\\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013})]" ] }, - "execution_count": 12, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -271,19 +335,19 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", - " Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", + "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", + " Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. 
It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", - " Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2014})]" + " Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. 
\\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012})]" ] }, - "execution_count": 13, + "execution_count": 56, "metadata": {}, "output_type": "execute_result" } @@ -303,19 +367,19 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", - " Document(page_content='One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. \\n\\nWhen they came home, many of the world’s fittest and best trained warriors were never the same. \\n\\nHeadaches. Numbness. Dizziness. \\n\\nA cancer that would put them in a flag-draped coffin. \\n\\nI know. 
\\n\\nOne of those soldiers was my son Major Beau Biden. \\n\\nWe don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \\n\\nBut I’m committed to finding out everything we can. \\n\\nCommitted to military families like Danielle Robinson from Ohio. \\n\\nThe widow of Sergeant First Class Heath Robinson. \\n\\nHe was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \\n\\nStationed near Baghdad, just yards from burn pits the size of football fields. \\n\\nHeath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2014}),\n", - " Document(page_content='As Ohio Senator Sherrod Brown says, “It’s time to bury the label “Rust Belt.” \\n\\nIt’s time. \\n\\nBut with all the bright spots in our economy, record job growth and higher wages, too many families are struggling to keep up with the bills. \\n\\nInflation is robbing them of the gains they might otherwise feel. \\n\\nI get it. That’s why my top priority is getting prices under control. \\n\\nLook, our economy roared back faster than most predicted, but the pandemic meant that businesses had a hard time hiring enough workers to keep up production in their factories. \\n\\nThe pandemic also disrupted global supply chains. \\n\\nWhen factories close, it takes longer to make goods and get them from the warehouse to the store, and prices go up. \\n\\nLook at cars. \\n\\nLast year, there weren’t enough semiconductors to make all the cars that people wanted to buy. \\n\\nAnd guess what, prices of automobiles went up. \\n\\nSo—we have a choice. \\n\\nOne way to fight inflation is to drive down wages and make Americans poorer.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", - " Document(page_content='We can’t change how divided we’ve been. 
But we can change how we move forward—on COVID-19 and other issues we must face together. \\n\\nI recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \\n\\nThey were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \\n\\nOfficer Mora was 27 years old. \\n\\nOfficer Rivera was 22. \\n\\nBoth Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \\n\\nI spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \\n\\nI’ve worked on these issues a long time. \\n\\nI know what works: Investing in crime preventionand community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012})]" + "[Document(page_content='Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013}),\n", + " Document(page_content='Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \\n\\nAnd as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \\n\\nThat ends on my watch. \\n\\nMedicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \\n\\nWe’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \\n\\nLet’s pass the Paycheck Fairness Act and paid leave. \\n\\nRaise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \\n\\nLet’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", + " Document(page_content='A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \\n\\nAnd if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \\n\\nWe can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \\n\\nWe’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. 
\\n\\nWe’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \\n\\nWe’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2012}),\n", + " Document(page_content='And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \\n\\nAs I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \\n\\nWhile it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \\n\\nAnd soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \\n\\nSo tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. 
\\n\\nFirst, beat the opioid epidemic.', metadata={'source': '../../../state_of_the_union.txt', 'year': 2013})]" ] }, - "execution_count": 14, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -324,6 +388,46 @@ "db.max_marginal_relevance_search('What did the president say about Ketanji Brown Jackson?')" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Delete dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "db.delete_dataset()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "and if delete fails you can also force delete" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [] + } + ], + "source": [ + "DeepLake.force_delete_by_path(\"./my_deeplake\")" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -335,7 +439,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ @@ -344,7 +448,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -352,57 +456,122 @@ "output_type": "stream", "text": [ "Your Deep Lake dataset has been successfully created!\n", - "The dataset is private so make sure you are logged in!\n" + "The dataset is private so make sure you are logged in!\n", + "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/langchain_test\n", + "hub://davitbun/langchain_test loaded successfully.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "\\" + "Evaluating ingest: 100%|██████████| 1/1 [00:14<00:00\n", + " \r" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at 
https://app.activeloop.ai/davitbun/linkedin\n" + "Dataset(path='hub://davitbun/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "\n", + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding generic (4, 1536) float32 None \n", + " ids text (4, 1) str None \n", + " metadata json (4, 1) str None \n", + " text text (4, 1) str None \n" ] }, { - "name": "stderr", + "data": { + "text/plain": [ + "['d6d6ccb4-e187-11ed-b66d-41c5f7b85421',\n", + " 'd6d6ccb5-e187-11ed-b66d-41c5f7b85421',\n", + " 'd6d6ccb6-e187-11ed-b66d-41c5f7b85421',\n", + " 'd6d6ccb7-e187-11ed-b66d-41c5f7b85421']" + ] + }, + "execution_count": 63, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Embed and store the texts\n", + "username = \"\" # your username on app.activeloop.ai \n", + "dataset_path = f\"hub://{username}/langchain_test\" # could be also ./local/path (much faster locally), s3://bucket/path/to/dataset, gcs://path/to/dataset, etc.\n", + "\n", + "embedding = OpenAIEmbeddings()\n", + "db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings, overwrite=True)\n", + "db.add_documents(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "name": "stdout", "output_type": "stream", "text": [ - " \r" + "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "\n", + "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "\n", + "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. 
\n", + "\n", + "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" ] - }, + } + ], + "source": [ + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "docs = db.similarity_search(query)\n", + "print(docs[0].page_content)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating dataset on AWS S3" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "hub://davitbun/linkedin loaded successfully.\n" + "s3://hub-2.0-datasets-n/langchain_test loaded successfully.\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ - "Evaluating ingest: 100%|██████████| 1/1 [00:23<00:00\n", - "/" + "Evaluating ingest: 100%|██████████| 1/1 [00:10<00:00\n", + "\\" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "Dataset(path='hub://davitbun/linkedin', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "Dataset(path='s3://hub-2.0-datasets-n/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n", "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (42, 1536) float32 None \n", - " ids text (42, 1) str None \n", - " metadata json (42, 1) str None \n", - " text text (42, 1) str None \n" + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding generic (4, 1536) float32 None \n", + " ids text (4, 1) str None \n", + " metadata json (4, 1) str None \n", + " text text (4, 1) str None \n" ] }, { @@ -414,69 +583,212 @@ } ], "source": [ - "# Embed and store the texts\n", - "dataset_path = f\"hub://{USERNAME}/{DATASET_NAME}\" # could be also ./local/path (much faster locally), s3://bucket/path/to/dataset, gcs://path/to/dataset, 
etc.\n", + "dataset_path = f\"s3://BUCKET/langchain_test\" # could be also ./local/path (much faster locally), hub://bucket/path/to/dataset, gcs://path/to/dataset, etc.\n", "\n", "embedding = OpenAIEmbeddings()\n", - "vectordb = DeepLake.from_documents(documents=docs, embedding=embedding, dataset_path=dataset_path)" + "db = DeepLake.from_documents(docs, dataset_path=dataset_path, embedding=embeddings, overwrite=True, creds = {\n", + " 'aws_access_key_id': os.environ['AWS_ACCESS_KEY_ID'], \n", + " 'aws_secret_access_key': os.environ['AWS_SECRET_ACCESS_KEY'], \n", + " 'aws_session_token': os.environ['AWS_SESSION_TOKEN'], # Optional\n", + "})" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Deep Lake API\n", + "you can access the Deep Lake dataset at `db.ds`" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 66, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", - "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", - "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "Dataset(path='hub://davitbun/langchain_test', tensors=['embedding', 'ids', 'metadata', 'text'])\n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n" + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding generic (4, 1536) float32 None \n", + " ids text (4, 1) str None \n", + " metadata json (4, 1) str None \n", + " text text (4, 1) str None \n" ] } ], "source": [ - "query = \"What did the president say about Ketanji Brown Jackson\"\n", - "docs = db.similarity_search(query)\n", - "print(docs[0].page_content)" + "# get structure of the dataset\n", + "db.ds.summary()" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 67, + "metadata": {}, + "outputs": [], + "source": [ + "# get embeddings numpy array\n", + "embeds = db.ds.embedding.numpy()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Transfer local dataset to cloud\n", + "Copy already created dataset to the cloud. You can also transfer from cloud to local." + ] + }, + { + "cell_type": "code", + "execution_count": 73, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Copying dataset: 100%|██████████| 56/56 [00:38<00:00\n" + ] + }, { "name": "stdout", "output_type": "stream", "text": [ - "Dataset(path='hub://davitbun/linkedin', tensors=['embedding', 'ids', 'metadata', 'text'])\n", - "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (42, 1536) float32 None \n", - " ids text (42, 1) str None \n", - " metadata json (42, 1) str None \n", - " text text (42, 1) str None \n" + "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/langchain_test_copy\n", + "Your Deep Lake dataset has been successfully created!\n", + "The dataset is private so make sure you are logged in!\n" ] + }, + { + "data": { + "text/plain": [ + "Dataset(path='hub://davitbun/langchain_test_copy', 
tensors=['embedding', 'ids', 'metadata', 'text'])" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "vectordb.ds.summary()" + "import deeplake\n", + "username = \"davitbun\" # your username on app.activeloop.ai \n", + "source = f\"hub://{username}/langchain_test\" # could be local, s3, gcs, etc.\n", + "destination = f\"hub://{username}/langchain_test_copy\" # could be local, s3, gcs, etc.\n", + "\n", + "deeplake.deepcopy(src=source, dest=destination, overwrite=True)\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 76, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/langchain_test_copy\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "hub://davitbun/langchain_test_copy loaded successfully.\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Deep Lake Dataset in hub://davitbun/langchain_test_copy already exists, loading from the storage\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dataset(path='hub://davitbun/langchain_test_copy', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "\n", + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding generic (4, 1536) float32 None \n", + " ids text (4, 1) str None \n", + " metadata json (4, 1) str None \n", + " text text (4, 1) str None \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Evaluating ingest: 100%|██████████| 1/1 [00:31<00:00\n", + "-" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + 
"Dataset(path='hub://davitbun/langchain_test_copy', tensors=['embedding', 'ids', 'metadata', 'text'])\n", + "\n", + " tensor htype shape dtype compression\n", + " ------- ------- ------- ------- ------- \n", + " embedding generic (8, 1536) float32 None \n", + " ids text (8, 1) str None \n", + " metadata json (8, 1) str None \n", + " text text (8, 1) str None \n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "data": { + "text/plain": [ + "['ad42f3fe-e188-11ed-b66d-41c5f7b85421',\n", + " 'ad42f3ff-e188-11ed-b66d-41c5f7b85421',\n", + " 'ad42f400-e188-11ed-b66d-41c5f7b85421',\n", + " 'ad42f401-e188-11ed-b66d-41c5f7b85421']" + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "embeddings = vectordb.ds.embedding.numpy()" + "db = DeepLake(dataset_path=destination, embedding_function=embeddings)\n", + "db.add_documents(docs)" ] }, { diff --git a/docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb b/docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb index cbfc09a3..04f689fe 100644 --- a/docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb +++ b/docs/use_cases/code/twitter-the-algorithm-analysis-deeplake.ipynb @@ -40,8 +40,24 @@ "from langchain.vectorstores import DeepLake\n", "\n", "os.environ['OPENAI_API_KEY'] = getpass.getpass('OpenAI API Key:')\n", - "os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')\n", - "embeddings = OpenAIEmbeddings()" + "os.environ['ACTIVELOOP_TOKEN'] = getpass.getpass('Activeloop Token:')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "embeddings = OpenAIEmbeddings(disallowed_special=())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "disallowed_special=() is required to avoid `Exception: 'utf-8' codec can't decode byte 0xff in position 0: invalid start byte` from 
tiktoken for some repositories" ] }, { @@ -120,7 +136,9 @@ "metadata": {}, "outputs": [], "source": [ - "db = DeepLake.from_documents(texts, embeddings, dataset_path=\"hub://davitbun/twitter-algorithm\")" + "username = \"davitbun\" # replace with your username from app.activeloop.ai\n", + "db = DeepLake(dataset_path=f\"hub://{username}/twitter-algorithm\", embedding_function=embeddings, public=True) #dataset would be publicly available\n", + "db.add_documents(texts)" ] }, { @@ -133,61 +151,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "-" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "This dataset can be visualized in Jupyter Notebook by ds.visualize() or at https://app.activeloop.ai/davitbun/twitter-algorithm\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "-" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "hub://davitbun/twitter-algorithm loaded successfully.\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Deep Lake Dataset in hub://davitbun/twitter-algorithm already exists, loading from the storage\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dataset(path='hub://davitbun/twitter-algorithm', read_only=True, tensors=['embedding', 'ids', 'metadata', 'text'])\n", - "\n", - " tensor htype shape dtype compression\n", - " ------- ------- ------- ------- ------- \n", - " embedding generic (23152, 1536) float32 None \n", - " ids text (23152, 1) str None \n", - " metadata json (23152, 1) str None \n", - " text text (23152, 1) str None \n" - ] - } - ], + "outputs": [], "source": [ "db = DeepLake(dataset_path=\"hub://davitbun/twitter-algorithm\", read_only=True, embedding_function=embeddings)" ] @@ -203,7 +169,7 @@ "retriever.search_kwargs['distance_metric'] = 'cos'\n", 
"retriever.search_kwargs['fetch_k'] = 100\n", "retriever.search_kwargs['maximal_marginal_relevance'] = True\n", - "retriever.search_kwargs['k'] = 20" + "retriever.search_kwargs['k'] = 10" ] }, { @@ -241,7 +207,7 @@ "from langchain.chat_models import ChatOpenAI\n", "from langchain.chains import ConversationalRetrievalChain\n", "\n", - "model = ChatOpenAI(model='gpt-4') # 'gpt-3.5-turbo',\n", + "model = ChatOpenAI(model='gpt-3.5-turbo') # switch to 'gpt-4'\n", "qa = ConversationalRetrievalChain.from_llm(model,retriever=retriever)" ] }, diff --git a/docs/use_cases/question_answering/semantic-search-over-chat.ipynb b/docs/use_cases/question_answering/semantic-search-over-chat.ipynb index f1e2c654..3a53bc33 100644 --- a/docs/use_cases/question_answering/semantic-search-over-chat.ipynb +++ b/docs/use_cases/question_answering/semantic-search-over-chat.ipynb @@ -108,7 +108,7 @@ "\n", "dataset_path = 'hub://'+org+'/data'\n", "embeddings = OpenAIEmbeddings()\n", - "db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path)" + "db = DeepLake.from_documents(texts, embeddings, dataset_path=dataset_path, overwrite=True)" ] }, { diff --git a/langchain/vectorstores/deeplake.py b/langchain/vectorstores/deeplake.py index a59c5a04..2f3f9970 100644 --- a/langchain/vectorstores/deeplake.py +++ b/langchain/vectorstores/deeplake.py @@ -43,6 +43,9 @@ def vector_search( returns: nearest_indices: List, indices of nearest neighbors """ + if data_vectors.shape[0] == 0: + return [], [] + # Calculate the distance between the query_vector and all data_vectors distances = distance_metric_map[distance_metric](query_embedding, data_vectors) nearest_indices = np.argsort(distances) @@ -87,7 +90,7 @@ class DeepLake(VectorStore): vectorstore = DeepLake("langchain_store", embeddings.embed_query) """ - _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "mem://langchain" + _LANGCHAIN_DEFAULT_DEEPLAKE_PATH = "./deeplake/" def __init__( self, @@ -96,7 +99,7 @@ class DeepLake(VectorStore): 
embedding_function: Optional[Embeddings] = None, read_only: Optional[bool] = False, ingestion_batch_size: int = 1024, - num_workers: int = 4, + num_workers: int = 0, **kwargs: Any, ) -> None: """Initialize with Deep Lake client.""" @@ -112,8 +115,13 @@ class DeepLake(VectorStore): "Please install it with `pip install deeplake`." ) self._deeplake = deeplake + self.dataset_path = dataset_path + creds_args = {"creds": kwargs["creds"]} if "creds" in kwargs else {} - if deeplake.exists(dataset_path, token=token): + if ( + deeplake.exists(dataset_path, token=token, **creds_args) + and "overwrite" not in kwargs + ): self.ds = deeplake.load( dataset_path, token=token, read_only=read_only, **kwargs ) @@ -123,6 +131,9 @@ class DeepLake(VectorStore): ) self.ds.summary() else: + if "overwrite" in kwargs: + del kwargs["overwrite"] + self.ds = deeplake.empty( dataset_path, token=token, overwrite=True, **kwargs ) @@ -215,6 +226,9 @@ class DeepLake(VectorStore): ) batch_size = min(self.ingestion_batch_size, len(elements)) + if batch_size == 0: + return [] + batched = [ elements[i : i + batch_size] for i in range(0, len(elements), batch_size) ] @@ -222,7 +236,8 @@ class DeepLake(VectorStore): ingest().eval( batched, self.ds, - num_workers=min(self.num_workers, len(batched) // self.num_workers), + num_workers=min(self.num_workers, len(batched) // max(self.num_workers, 1)), + **kwargs, ) self.ds.commit(allow_empty=True) self.ds.summary() @@ -443,8 +458,8 @@ class DeepLake(VectorStore): ) -> DeepLake: """Create a Deep Lake dataset from a raw documents. - If a dataset_path is specified, the dataset will be persisted there. - Otherwise, the data will be ephemeral in-memory. + If a dataset_path is specified, the dataset will be persisted in that location, + otherwise by default at `./deeplake` Args: path (str, pathlib.Path): - The full path to the dataset. Can be: @@ -493,7 +508,7 @@ class DeepLake(VectorStore): Defaults to None. 
""" if delete_all: - self.ds.delete() + self.ds.delete(large_ok=True) return True view = None @@ -515,6 +530,18 @@ class DeepLake(VectorStore): return True + @classmethod + def force_delete_by_path(cls, path: str) -> None: + """Force delete dataset by path""" + try: + import deeplake + except ImportError: + raise ValueError( + "Could not import deeplake python package. " + "Please install it with `pip install deeplake`." + ) + deeplake.delete(path, large_ok=True, force=True) + def delete_dataset(self) -> None: """Delete the collection.""" self.delete(delete_all=True) diff --git a/poetry.lock b/poetry.lock index 48512472..1138b919 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "absl-py" @@ -1413,17 +1413,17 @@ files = [ [[package]] name = "deeplake" -version = "3.2.22" +version = "3.3.0" description = "Activeloop Deep Lake" category = "main" optional = false python-versions = "*" files = [ - {file = "deeplake-3.2.22.tar.gz", hash = "sha256:068280561366dd1bd891d3ffda8638ec59860a23b9426815a484b0591ab467a6"}, + {file = "deeplake-3.3.0.tar.gz", hash = "sha256:161663ccba922156912a0ddace7133284487732b8d671fc64c74519ccce62d96"}, ] [package.dependencies] -aioboto3 = {version = "10.4.0", markers = "python_version >= \"3.7\" and sys_platform != \"win32\""} +aioboto3 = {version = ">=10.4.0", markers = "python_version >= \"3.7\" and sys_platform != \"win32\""} boto3 = "*" click = "*" humbug = ">=0.3.1" @@ -1436,11 +1436,10 @@ pyjwt = "*" tqdm = "*" [package.extras] -all = ["IPython", "av (>=8.1.0)", "flask", "google-api-python-client (>=2.31.0,<2.32.0)", "google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "google-cloud-storage (>=1.42.0,<1.43.0)", "laspy", "libdeeplake (==0.0.41)", "nibabel", "oauth2client (>=4.1.3,<4.2.0)", "pydicom"] +all = 
["IPython", "av (>=8.1.0)", "flask", "google-api-python-client (>=2.31.0,<2.32.0)", "google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "google-cloud-storage (>=1.42.0,<1.43.0)", "laspy", "nibabel", "oauth2client (>=4.1.3,<4.2.0)", "pydicom"] audio = ["av (>=8.1.0)"] av = ["av (>=8.1.0)"] dicom = ["nibabel", "pydicom"] -enterprise = ["libdeeplake (==0.0.41)", "pyjwt"] gcp = ["google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "google-cloud-storage (>=1.42.0,<1.43.0)"] gdrive = ["google-api-python-client (>=2.31.0,<2.32.0)", "google-auth (>=2.0.1,<2.1.0)", "google-auth-oauthlib (>=0.4.5,<0.5.0)", "oauth2client (>=4.1.3,<4.2.0)"] medical = ["nibabel", "pydicom"] @@ -7508,7 +7507,7 @@ files = [ ] [package.dependencies] -greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and (platform_machine == \"aarch64\" or platform_machine == \"ppc64le\" or platform_machine == \"x86_64\" or platform_machine == \"amd64\" or platform_machine == \"AMD64\" or platform_machine == \"win32\" or platform_machine == \"WIN32\")"} +greenlet = {version = "!=0.4.17", markers = "python_version >= \"3\" and platform_machine == \"aarch64\" or python_version >= \"3\" and platform_machine == \"ppc64le\" or python_version >= \"3\" and platform_machine == \"x86_64\" or python_version >= \"3\" and platform_machine == \"amd64\" or python_version >= \"3\" and platform_machine == \"AMD64\" or python_version >= \"3\" and platform_machine == \"win32\" or python_version >= \"3\" and platform_machine == \"WIN32\""} [package.extras] aiomysql = ["aiomysql", "greenlet (!=0.4.17)"] @@ -9268,13 +9267,13 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\ cffi = ["cffi (>=1.11)"] [extras] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", 
"beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect"] +all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] cohere = ["cohere"] -llms = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "manifest-ml", "torch", "transformers"] +llms = ["anthropic", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "torch", "transformers"] openai = ["openai"] qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "da027a1b27f348548ca828c6da40795e2f57a7a7858bdeac1a08573d3e031e12" +content-hash = "ab6ea1c53c7a6e792d5bdcf8865b87e5dcfe4c89080c18b356dc4ed8a17cc3a3" diff --git a/pyproject.toml b/pyproject.toml index de48aaf0..0eec4645 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -56,7 +56,7 @@ arxiv = {version = "^1.4", optional = true} pypdf = {version = "^3.4.0", optional = true} networkx = {version="^2.6.3", optional = true} aleph-alpha-client = {version="^2.15.0", optional = true} -deeplake = {version = "^3.2.21", optional 
= true} +deeplake = {version = "^3.3.0", optional = true} pgvector = {version = "^0.1.6", optional = true} psycopg2-binary = {version = "^2.9.5", optional = true} #boto3 = {version = "^1.26.96", optional = true} # TODO: fix it, commented because the version failed with deeplake diff --git a/tests/integration_tests/vectorstores/test_deeplake.py b/tests/integration_tests/vectorstores/test_deeplake.py index 2463a4bb..f858c904 100644 --- a/tests/integration_tests/vectorstores/test_deeplake.py +++ b/tests/integration_tests/vectorstores/test_deeplake.py @@ -164,3 +164,10 @@ def test_delete_dataset_by_filter(deeplake_datastore: DeepLake) -> None: assert len(deeplake_datastore.ds) == 2 deeplake_datastore.delete_dataset() + + +def test_delete_by_path(deeplake_datastore: DeepLake) -> None: + """Test delete dataset.""" + path = deeplake_datastore.dataset_path + DeepLake.force_delete_by_path(path) + assert not deeplake.exists(path)