From 25ec655e4fd134ca15bfdb9f7640769c7e811d02 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 7 Sep 2023 10:04:49 -0700 Subject: [PATCH 01/13] supabase embedding usage fix (#10335) Should be calling Embeddings.embed_query instead of embed_documents when searching --- libs/langchain/langchain/vectorstores/supabase.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/libs/langchain/langchain/vectorstores/supabase.py b/libs/langchain/langchain/vectorstores/supabase.py index d911965346..4214e085f6 100644 --- a/libs/langchain/langchain/vectorstores/supabase.py +++ b/libs/langchain/langchain/vectorstores/supabase.py @@ -168,10 +168,8 @@ class SupabaseVectorStore(VectorStore): filter: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> List[Document]: - vectors = self._embedding.embed_documents([query]) - return self.similarity_search_by_vector( - vectors[0], k=k, filter=filter, **kwargs - ) + vector = self._embedding.embed_query(query) + return self.similarity_search_by_vector(vector, k=k, filter=filter, **kwargs) def similarity_search_by_vector( self, @@ -195,9 +193,9 @@ class SupabaseVectorStore(VectorStore): filter: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: - vectors = self._embedding.embed_documents([query]) + vector = self._embedding.embed_query(query) return self.similarity_search_by_vector_with_relevance_scores( - vectors[0], k=k, filter=filter + vector, k=k, filter=filter ) def match_args( @@ -407,9 +405,9 @@ class SupabaseVectorStore(VectorStore): $$; ``` """ - embedding = self._embedding.embed_documents([query]) + embedding = self._embedding.embed_query(query) docs = self.max_marginal_relevance_search_by_vector( - embedding[0], k, fetch_k, lambda_mult=lambda_mult + embedding, k, fetch_k, lambda_mult=lambda_mult ) return docs From a9eb7c6cfc99223df658cecd1cd5ad331bba3c53 Mon Sep 17 00:00:00 2001 From: Ofer Mendelevitch Date: Thu, 7 Sep 2023 10:24:50 -0700 Subject: [PATCH 02/13] Adding Self-querying for Vectara (#10332) - Description: Adding support for self-querying to Vectara integration - Issue: per customer request - Tag maintainer: @rlancemartin @baskaryan - Twitter handle: @ofermend Also updated some documentation, added self-query testing, and a demo notebook with self-query example. --- .../integrations/providers/vectara/index.mdx | 5 +- .../integrations/vectorstores/vectara.ipynb | 178 +++++-- .../self_query/vectara_self_query.ipynb | 440 ++++++++++++++++++ .../langchain/retrievers/self_query/base.py | 3 + .../retrievers/self_query/vectara.py | 69 +++ .../langchain/vectorstores/vectara.py | 8 +- .../document_loaders/test_polars_dataframe.py | 2 - .../retrievers/self_query/test_vectara.py | 71 +++ 8 files changed, 743 insertions(+), 33 deletions(-) create mode 100644 docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb create mode 100644 libs/langchain/langchain/retrievers/self_query/vectara.py create mode 100644 libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py diff --git a/docs/extras/integrations/providers/vectara/index.mdx b/docs/extras/integrations/providers/vectara/index.mdx index 0e30735f0b..abd8283735 100644 --- a/docs/extras/integrations/providers/vectara/index.mdx +++ b/docs/extras/integrations/providers/vectara/index.mdx @@ -11,9 +11,10 @@ What is Vectara? - You can use Vectara's integration with LangChain as a Vector store or using the Retriever abstraction. 
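For example, once the setup described below is complete, basic usage looks like this (a minimal sketch; both ways of passing credentials are covered in the next section):

```python
from langchain.vectorstores import Vectara

vectara = Vectara()  # reads VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY from the environment
found_docs = vectara.similarity_search("What does Vectara offer?", k=5)
```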
## Installation and Setup -To use Vectara with LangChain no special installation steps are required. You just have to provide your customer_id, corpus ID, and an API key created within the Vectara console to enable indexing and searching. +To use Vectara with LangChain no special installation steps are required. +To get started, follow our [quickstart](https://docs.vectara.com/docs/quickstart) guide to create an account, a corpus and an API key. +Once you have these, you can provide them as arguments to the Vectara vectorstore, or you can set them as environment variables. -Alternatively these can be provided as environment variables - export `VECTARA_CUSTOMER_ID`="your_customer_id" - export `VECTARA_CORPUS_ID`="your_corpus_id" - export `VECTARA_API_KEY`="your-vectara-api-key" diff --git a/docs/extras/integrations/vectorstores/vectara.ipynb b/docs/extras/integrations/vectorstores/vectara.ipynb index 0741c1b199..e95504860b 100644 --- a/docs/extras/integrations/vectorstores/vectara.ipynb +++ b/docs/extras/integrations/vectorstores/vectara.ipynb @@ -26,7 +26,7 @@ "source": [ "# Setup\n", "\n", - "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps:\n", + "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps (see our [quickstart](https://docs.vectara.com/docs/quickstart) guide):\n", "1. [Sign up](https://console.vectara.com/signup) for a Vectara account if you don't already have one. Once you have completed your sign up you will have a Vectara customer ID. You can find your customer ID by clicking on your name, on the top-right of the Vectara console window.\n", "2. Within your account you can create one or more corpora. Each corpus represents an area that stores text data upon ingest from input documents. To create a corpus, use the **\"Create Corpus\"** button. You then provide a name to your corpus as well as a description. Optionally you can define filtering attributes and apply some advanced options. If you click on your created corpus, you can see its name and corpus ID right on the top.\n", "3. Next you'll need to create API keys to access the corpus. Click on the **\"Authorization\"** tab in the corpus view and then the **\"Create API Key\"** button. Give your key a name, and choose whether you want query only or query+index for your key. Click \"Create\" and you now have an active API key. Keep this key confidential. \n", @@ -47,7 +47,7 @@ "os.environ[\"VECTARA_API_KEY\"] = getpass.getpass(\"Vectara API Key:\")\n", "```\n", "\n", - "2. Add them to the Vectara vectorstore constructor:\n", + "1. Provide them as arguments when creating the Vectara vectorstore object:\n", "\n", "```python\n", "vectorstore = Vectara(\n", @@ -65,13 +65,22 @@ "source": [ "## Connecting to Vectara from LangChain\n", "\n", - "To get started, let's ingest the documents using the from_documents() method.\n", - "We assume here that you've added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and query+indexing VECTARA_API_KEY as environment variables." 
+ "In this example, we assume that you've created an account and a corpus, and added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY (created with permissions for both indexing and query) as environment variables.\n", + "\n", + "The corpus has 3 fields defined as metadata for filtering:\n", + "* url: a string field containing the source URL of the document (where relevant)\n", + "* speech: a string field containing the name of the speech\n", + "* author: the name of the author\n", + "\n", + "Let's start by ingesting 3 documents into the corpus:\n", + "1. The State of the Union speech from 2022, available in the LangChain repository as a text file\n", + "2. The \"I have a dream\" speech by Dr. Kind\n", + "3. The \"We shall Fight on the Beaches\" speech by Winston Churchil" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "04a1f1a0", "metadata": {}, "outputs": [], @@ -79,12 +88,17 @@ "from langchain.embeddings import FakeEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.vectorstores import Vectara\n", - "from langchain.document_loaders import TextLoader" + "from langchain.document_loaders import TextLoader\n", + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "be0a4973", "metadata": {}, "outputs": [], @@ -97,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "8429667e", "metadata": { "ExecuteTime": { @@ -111,7 +125,7 @@ "vectara = Vectara.from_documents(\n", " docs,\n", " embedding=FakeEmbeddings(size=768),\n", - " doc_metadata={\"speech\": \"state-of-the-union\"},\n", + " doc_metadata={\"speech\": \"state-of-the-union\", \"author\": \"Biden\"},\n", ")" ] }, @@ -130,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "85ef3468", "metadata": {}, "outputs": [], @@ -142,14 +156,16 @@ " [\n", " \"https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf\",\n", " \"I-have-a-dream\",\n", + " \"Dr. 
King\"\n", " ],\n", " [\n", " \"https://www.parkwayschools.net/cms/lib/MO01931486/Centricity/Domain/1578/Churchill_Beaches_Speech.pdf\",\n", " \"we shall fight on the beaches\",\n", + " \"Churchil\"\n", " ],\n", "]\n", "files_list = []\n", - "for url, _ in urls:\n", + "for url, _, _ in urls:\n", " name = tempfile.NamedTemporaryFile().name\n", " urllib.request.urlretrieve(url, name)\n", " files_list.append(name)\n", @@ -157,7 +173,7 @@ "docsearch: Vectara = Vectara.from_files(\n", " files=files_list,\n", " embedding=FakeEmbeddings(size=768),\n", - " metadatas=[{\"url\": url, \"speech\": title} for url, title in urls],\n", + " metadatas=[{\"url\": url, \"speech\": title, \"author\": author} for url, title, author in urls],\n", ")" ] }, @@ -178,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "a8c513ab", "metadata": { "ExecuteTime": { @@ -197,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "fc516993", "metadata": { "ExecuteTime": { @@ -231,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "8804a21d", "metadata": { "ExecuteTime": { @@ -249,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "756a6887", "metadata": { "ExecuteTime": { @@ -264,7 +280,7 @@ "text": [ "Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice.\n", "\n", - "Score: 0.786569\n" + "Score: 0.8299499\n" ] } ], @@ -284,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "47784de5", "metadata": {}, "outputs": [ @@ -307,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "3e22949f", "metadata": {}, "outputs": [ @@ -315,7 +331,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "With this threshold of 0.2 we have 3 documents\n" + "With this threshold of 0.2 we have 5 documents\n" ] } ], @@ -340,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "9427195f", "metadata": { "ExecuteTime": { @@ -352,10 +368,10 @@ { "data": { "text/plain": [ - "VectaraRetriever(tags=['Vectara'], metadata=None, vectorstore=, search_type='similarity', search_kwargs={'lambda_val': 0.025, 'k': 5, 'filter': '', 'n_sentence_context': '2'})" + "VectaraRetriever(tags=['Vectara'], metadata=None, vectorstore=, search_type='similarity', search_kwargs={'lambda_val': 0.025, 'k': 5, 'filter': '', 'n_sentence_context': '2'})" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -367,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "f3c70c31", "metadata": { "ExecuteTime": { @@ -379,10 +395,10 @@ { "data": { "text/plain": [ - "Document(page_content='Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 
A former top litigator in private practice.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '596', 'len': '97', 'speech': 'state-of-the-union'})" + "Document(page_content='Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '596', 'len': '97', 'speech': 'state-of-the-union', 'author': 'Biden'})" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", "retriever.get_relevant_documents(query)[0]" ] }, { "cell_type": "markdown", "id": "e944c26a", "metadata": {}, "source": [ "## Using Vectara as a self-querying retriever" ] }, { "cell_type": "code", "execution_count": 15, "id": "8be674de", "metadata": {}, "outputs": [], "source": [ "metadata_field_info = [\n", " AttributeInfo(\n", " name=\"speech\",\n", " description=\"the name of the speech\",\n", " type=\"string or list[string]\",\n", " ),\n", " AttributeInfo(\n", " name=\"author\",\n", " description=\"author of the speech\",\n", " type=\"string or list[string]\",\n", " ),\n", "]\n", "document_content_description = \"the text of the speech\"\n", "\n", "llm = OpenAI(temperature=0)\n", "retriever = SelfQueryRetriever.from_llm(\n", " llm, vectara, document_content_description, metadata_field_info, verbose=True\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "id": "f8938999", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/ofer/dev/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", " warnings.warn(\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "query='freedom' filter=Comparison(comparator=, attribute='author', value='Biden') limit=None\n" ] }, { "data": { "text/plain": [ "[Document(page_content='Well I know this nation. We will meet the test. To protect freedom and liberty, to expand fairness and opportunity. We will save democracy. As hard as these times have been, I am more optimistic about America today than I have been my whole life.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '346', 'len': '67', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", " Document(page_content='To our fellow Ukrainian Americans who forge a deep bond that connects our two nations we stand with you. Putin may circle Kyiv with tanks, but he will never gain the hearts and souls of the Ukrainian people. He will never extinguish their love of freedom. He will never weaken the resolve of the free world. We meet tonight in an America that has lived through two of the hardest years this nation has ever faced.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '740', 'len': '47', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", " Document(page_content='But most importantly as Americans. With a duty to one another to the American people to the Constitution. 
And with an unwavering resolve that freedom will always triumph over tyranny. Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '413', 'len': '77', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='We can do this. \\n\\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. We have fought for freedom, expanded liberty, defeated totalitarianism and terror. And built the strongest, freest, and most prosperous nation the world has ever known. Now is the hour. \\n\\nOur moment of responsibility.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '906', 'len': '82', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. We cannot let this happen. Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '0', 'len': '63', 'speech': 'state-of-the-union', 'author': 'Biden'})]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever.get_relevant_documents(\"what did Biden say about the freedom?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a97037fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='freedom' filter=Comparison(comparator=, attribute='author', value='Dr. King') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='And if America is to be a great nation, this must become true. So\\nlet freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let\\nfreedom ring from the snowcapped Rockies of Colorado.', metadata={'lang': 'eng', 'section': '3', 'offset': '1534', 'len': '55', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='And if America is to be a great nation, this must become true. So\\nlet freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. 
Let\\nfreedom ring from the snowcapped Rockies of Colorado.', metadata={'lang': 'eng', 'section': '3', 'offset': '1534', 'len': '55', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. Let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout\\nMountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi, from every\\nmountain side. Let freedom ring . . .\\nWhen we allow freedom to ring—when we let it ring from every city and every hamlet, from every state\\nand every city, we will be able to speed up that day when all of God’s children, black men and white\\nmen, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the\\nold Negro spiritual, “Free at last, Free at last, Great God a-mighty, We are free at last.”', metadata={'lang': 'eng', 'section': '3', 'offset': '1842', 'len': '52', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. Let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout\\nMountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi, from every\\nmountain side. Let freedom ring . . .\\nWhen we allow freedom to ring—when we let it ring from every city and every hamlet, from every state\\nand every city, we will be able to speed up that day when all of God’s children, black men and white\\nmen, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the\\nold Negro spiritual, “Free at last, Free at last, Great God a-mighty, We are free at last.”', metadata={'lang': 'eng', 'section': '3', 'offset': '1842', 'len': '52', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let\\nfreedom ring from the snowcapped Rockies of Colorado. Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. 
Let freedom ring from Stone Mountain of Georgia.', metadata={'lang': 'eng', 'section': '3', 'offset': '1657', 'len': '57', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'})]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever.get_relevant_documents(\"what did Dr. King say about the freedom?\")" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "2300e785", + "id": "f6d17e90", "metadata": {}, "outputs": [], "source": [] diff --git a/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb b/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb new file mode 100644 index 0000000000..1e9128dc6f --- /dev/null +++ b/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb @@ -0,0 +1,440 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "13afcae7", + "metadata": {}, + "source": [ + "# Vectara self-querying \n", + "\n", + ">[Vectara](https://docs.vectara.com/docs/) is a GenAI platform for developers. It provides a simple API to build Grounded Generation (aka Retrieval-augmented-generation) applications.\n", + "\n", + "In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Vectara vector store. " + ] + }, + { + "cell_type": "markdown", + "id": "68e75fb9", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps (see our [quickstart](https://docs.vectara.com/docs/quickstart) guide):\n", + "1. [Sign up](https://console.vectara.com/signup) for a Vectara account if you don't already have one. Once you have completed your sign up you will have a Vectara customer ID. You can find your customer ID by clicking on your name, on the top-right of the Vectara console window.\n", + "2. Within your account you can create one or more corpora. Each corpus represents an area that stores text data upon ingest from input documents. To create a corpus, use the **\"Create Corpus\"** button. You then provide a name to your corpus as well as a description. Optionally you can define filtering attributes and apply some advanced options. If you click on your created corpus, you can see its name and corpus ID right on the top.\n", + "3. Next you'll need to create API keys to access the corpus. Click on the **\"Authorization\"** tab in the corpus view and then the **\"Create API Key\"** button. Give your key a name, and choose whether you want query only or query+index for your key. Click \"Create\" and you now have an active API key. Keep this key confidential. \n", + "\n", + "To use LangChain with Vectara, you'll need to have these three values: customer ID, corpus ID and api_key.\n", + "You can provide those to LangChain in two ways:\n", + "\n", + "1. 
Include in your environment these three variables: `VECTARA_CUSTOMER_ID`, `VECTARA_CORPUS_ID` and `VECTARA_API_KEY`.\n", + "\n", + "> For example, you can set these variables using os.environ and getpass as follows:\n", + "\n", + "```python\n", + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"VECTARA_CUSTOMER_ID\"] = getpass.getpass(\"Vectara Customer ID:\")\n", + "os.environ[\"VECTARA_CORPUS_ID\"] = getpass.getpass(\"Vectara Corpus ID:\")\n", + "os.environ[\"VECTARA_API_KEY\"] = getpass.getpass(\"Vectara API Key:\")\n", + "```\n", + "\n", + "1. Provide them as arguments when creating the Vectara vectorstore object:\n", + "\n", + "```python\n", + "vectorstore = Vectara(\n", + " vectara_customer_id=vectara_customer_id,\n", + " vectara_corpus_id=vectara_corpus_id,\n", + " vectara_api_key=vectara_api_key\n", + " )\n", + "```\n", + "\n", + "**Note:** The self-query retriever requires you to have `lark` installed (`pip install lark`). " + ] + }, + { + "cell_type": "markdown", + "id": "742ac16d", + "metadata": {}, + "source": [ + "## Connecting to Vectara from LangChain\n", + "\n", + "In this example, we assume that you've created an account and a corpus, and added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY (created with permissions for both indexing and query) as environment variables.\n", + "\n", + "The corpus has 4 fields defined as metadata for filtering: year, director, rating, and genre\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cb4a5787", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.embeddings import FakeEmbeddings\n", + "from langchain.schema import Document\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import Vectara\n", + "from langchain.document_loaders import TextLoader\n", + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bcbe04d9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = [\n", + " Document(\n", + " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", + " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", + " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n", + " ),\n", + " Document(\n", + " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", + " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n", + " ),\n", + " Document(\n", + " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", + " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n", + " ),\n", + " Document(\n", + " page_content=\"Toys come alive and have a blast doing so\",\n", + " metadata={\"year\": 1995, \"genre\": \"animated\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", + " metadata={\n", + " \"year\": 1979,\n", + " \"rating\": 9.9,\n", + " \"director\": \"Andrei 
Tarkovsky\",\n", + " \"genre\": \"science fiction\",\n", + " },\n", + " ),\n", + "]\n", + "\n", + "vectara = Vectara()\n", + "for doc in docs:\n", + " vectara.add_texts([doc.page_content], embedding=FakeEmbeddings(size=768), doc_metadata=doc.metadata)" + ] + }, + { + "cell_type": "markdown", + "id": "5ecaab6d", + "metadata": {}, + "source": [ + "## Creating our self-querying retriever\n", + "Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86e34dbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"director\",\n", + " description=\"The name of the movie director\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " ),\n", + "]\n", + "document_content_description = \"Brief summary of a movie\"\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, vectara, document_content_description, metadata_field_info, verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ea9df8d4", + "metadata": {}, + "source": [ + "## Testing it out\n", + "And now we can try actually using our retriever!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38a126e9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ofer/dev/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'lang': 'eng', 'offset': '0', 'len': '76', 'year': '2010', 'director': 'Christopher Nolan', 'rating': '8.2', 'source': 'langchain'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fc3f1e6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=, attribute='rating', value=8.5) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a filter\n", + "retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b19d4da0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='women' filter=Comparison(comparator=, attribute='director', value='Greta Gerwig') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after 
them', metadata={'lang': 'eng', 'offset': '0', 'len': '82', 'year': '2019', 'director': 'Greta Gerwig', 'rating': '8.3', 'source': 'langchain'})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a filter\n", + "retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f900e40e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='rating', value=8.5), Comparison(comparator=, attribute='genre', value='science fiction')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a highly rated (above 8.5) science fiction film?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "12a51522", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='toys' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='year', value=1990), Comparison(comparator=, attribute='year', value=2005), Comparison(comparator=, attribute='genre', value='animated')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51", + "metadata": {}, + "source": [ + "## Filter k\n", + "\n", + "We can also use the self query retriever to specify `k`: the number of documents to fetch.\n", + "\n", + "We can do this by passing `enable_limit=True` to the constructor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bff36b88-b506-4877-9c63-e5a1a8d78e64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "retriever = SelfQueryRetriever.from_llm(\n", + " llm,\n", + " vectara,\n", + " document_content_description,\n", + " metadata_field_info,\n", + " enable_limit=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2758d229-4f97-499c-819f-888acaf8ee10", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"what are two movies about dinosaurs\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/retrievers/self_query/base.py b/libs/langchain/langchain/retrievers/self_query/base.py index 0251bff52c..a34c756bd4 100644 --- a/libs/langchain/langchain/retrievers/self_query/base.py +++ b/libs/langchain/langchain/retrievers/self_query/base.py @@ -16,6 +16,7 @@ from langchain.retrievers.self_query.milvus import MilvusTranslator from langchain.retrievers.self_query.myscale import MyScaleTranslator from langchain.retrievers.self_query.pinecone import PineconeTranslator from langchain.retrievers.self_query.qdrant import QdrantTranslator +from langchain.retrievers.self_query.vectara import VectaraTranslator from langchain.retrievers.self_query.weaviate import WeaviateTranslator from langchain.schema import BaseRetriever, Document from langchain.schema.language_model import BaseLanguageModel @@ -28,6 +29,7 @@ from langchain.vectorstores import ( MyScale, Pinecone, Qdrant, + Vectara, VectorStore, Weaviate, ) @@ -41,6 +43,7 @@ def _get_builtin_translator(vectorstore: VectorStore) -> Visitor: Chroma: ChromaTranslator, DashVector: DashvectorTranslator, Weaviate: WeaviateTranslator, + Vectara: VectaraTranslator, Qdrant: QdrantTranslator, MyScale: MyScaleTranslator, DeepLake: DeepLakeTranslator, diff --git a/libs/langchain/langchain/retrievers/self_query/vectara.py b/libs/langchain/langchain/retrievers/self_query/vectara.py new file mode 100644 index 0000000000..73dc46ff59 --- /dev/null +++ b/libs/langchain/langchain/retrievers/self_query/vectara.py @@ -0,0 +1,69 @@ +from typing import Tuple, Union + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, + Visitor, +) + + +def process_value(value: Union[int, float, str]) -> str: + if 
isinstance(value, str): + return f"'{value}'" + else: + return str(value) + + +class VectaraTranslator(Visitor): + """Translate `Vectara` internal query language elements to valid filters.""" + + allowed_operators = [Operator.AND, Operator.OR] + """Subset of allowed logical operators.""" + allowed_comparators = [ + Comparator.EQ, + Comparator.NE, + Comparator.GT, + Comparator.GTE, + Comparator.LT, + Comparator.LTE, + ] + """Subset of allowed logical comparators.""" + + def _format_func(self, func: Union[Operator, Comparator]) -> str: + map_dict = { + Operator.AND: " and ", + Operator.OR: " or ", + Comparator.EQ: "=", + Comparator.NE: "!=", + Comparator.GT: ">", + Comparator.GTE: ">=", + Comparator.LT: "<", + Comparator.LTE: "<=", + } + self._validate_func(func) + return map_dict[func] + + def visit_operation(self, operation: Operation) -> str: + args = [arg.accept(self) for arg in operation.arguments] + operator = self._format_func(operation.operator) + return "( " + operator.join(args) + " )" + + def visit_comparison(self, comparison: Comparison) -> str: + comparator = self._format_func(comparison.comparator) + processed_value = process_value(comparison.value) + attribute = comparison.attribute + return ( + "( " + "doc." + attribute + " " + comparator + " " + processed_value + " )" + ) + + def visit_structured_query( + self, structured_query: StructuredQuery + ) -> Tuple[str, dict]: + if structured_query.filter is None: + kwargs = {} + else: + kwargs = {"filter": structured_query.filter.accept(self)} + return structured_query.query, kwargs diff --git a/libs/langchain/langchain/vectorstores/vectara.py b/libs/langchain/langchain/vectorstores/vectara.py index 457511b104..3e8a2549e2 100644 --- a/libs/langchain/langchain/vectorstores/vectara.py +++ b/libs/langchain/langchain/vectorstores/vectara.py @@ -396,8 +396,12 @@ class Vectara(VectorStore): vectara_api_key=api_key, ) """ - # Note: Vectara generates its own embeddings, so we ignore the provided - # embeddings (required by interface) + # Notes: + # * Vectara generates its own embeddings, so we ignore the provided + # embeddings (required by interface) + # * when metadatas[] are provided they are associated with each "part" + # in Vectara. 
doc_metadata can be used to provide additional metadata + # for the document itself (applies to all "texts" in this call) doc_metadata = kwargs.pop("doc_metadata", {}) vectara = cls(**kwargs) vectara.add_texts(texts, metadatas, doc_metadata=doc_metadata, **kwargs) diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py index 03f5070120..2858b41e8e 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py @@ -34,8 +34,6 @@ def test_load_returns_list_of_documents(sample_data_frame: pl.DataFrame) -> None def test_load_converts_dataframe_columns_to_document_metadata( sample_data_frame: pl.DataFrame, ) -> None: - import polars as pl - loader = PolarsDataFrameLoader(sample_data_frame) docs = loader.load() diff --git a/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py b/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py new file mode 100644 index 0000000000..05c15f26ac --- /dev/null +++ b/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py @@ -0,0 +1,71 @@ +from typing import Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, +) +from langchain.retrievers.self_query.vectara import VectaraTranslator + +DEFAULT_TRANSLATOR = VectaraTranslator() + + +def test_visit_comparison() -> None: + comp = Comparison(comparator=Comparator.LT, attribute="foo", value="1") + expected = "( doc.foo < '1' )" + actual = DEFAULT_TRANSLATOR.visit_comparison(comp) + assert expected == actual + + +def test_visit_operation() -> None: + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=1), + ], + ) + expected = "( ( doc.foo < 2 ) and ( doc.bar = 'baz' ) and ( doc.abc < 1 ) )" + actual = DEFAULT_TRANSLATOR.visit_operation(op) + assert expected == actual + + +def test_visit_structured_query() -> None: + query = "What is the capital of France?" 
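+
+    # First case: with no filter, the translator should return the query
+    # unchanged together with empty search kwargs.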
+ structured_query = StructuredQuery( + query=query, + filter=None, + limit=None, + ) + expected: Tuple[str, Dict] = (query, {}) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=1) + expected = (query, {"filter": "( doc.foo < 1 )"}) + structured_query = StructuredQuery( + query=query, + filter=comp, + limit=None, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=1), + ], + ) + structured_query = StructuredQuery(query=query, filter=op, limit=None) + expected = ( + query, + {"filter": "( ( doc.foo < 2 ) and ( doc.bar = 'baz' ) and ( doc.abc < 1 ) )"}, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual From 274c3dc3a82d5248859e09bccec1acef682335a4 Mon Sep 17 00:00:00 2001 From: maks-operlejn-ds <142261444+maks-operlejn-ds@users.noreply.github.com> Date: Thu, 7 Sep 2023 23:42:24 +0200 Subject: [PATCH 03/13] Multilingual anonymization (#10327) ### Description Add multiple language support to Anonymizer PII detection in Microsoft Presidio relies on several components - in addition to the usual pattern matching (e.g. using regex), the analyser uses a model for Named Entity Recognition (NER) to extract entities such as: - `PERSON` - `LOCATION` - `DATE_TIME` - `NRP` - `ORGANIZATION` [[Source]](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py) To handle NER in specific languages, we utilize unique models from the `spaCy` library, recognized for its extensive selection covering multiple languages and sizes. However, it's not restrictive, allowing for integration of alternative frameworks such as [Stanza](https://microsoft.github.io/presidio/analyzer/nlp_engines/spacy_stanza/) or [transformers](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/) when necessary. ### Future works - **automatic language detection** - instead of passing the language as a parameter in `anonymizer.anonymize`, we could detect the language/s beforehand and then use the corresponding NER model. 
We have discussed this internally and @mateusz-wosinski-ds will look into a standalone language detection tool/chain for LangChain :smile: ### Twitter handle @deepsense_ai / @MaksOpp ### Tag maintainer @baskaryan @hwchase17 @hinthornw --- ...b => 01_presidio_data_anonymization.ipynb} | 4 +- ...02_presidio_reversible_anonymization.ipynb | 461 ++++++++++++++++ ...residio_multi_language_anonymization.ipynb | 520 ++++++++++++++++++ .../presidio_reversible_anonymization.ipynb | 461 ---------------- .../data_anonymizer/base.py | 7 +- .../data_anonymizer/faker_presidio_mapping.py | 4 +- .../data_anonymizer/presidio.py | 71 ++- 7 files changed, 1053 insertions(+), 475 deletions(-) rename docs/extras/guides/privacy/{presidio_data_anonymization.ipynb => 01_presidio_data_anonymization.ipynb} (97%) create mode 100644 docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb create mode 100644 docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb delete mode 100644 docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb diff --git a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb b/docs/extras/guides/privacy/01_presidio_data_anonymization.ipynb similarity index 97% rename from docs/extras/guides/privacy/presidio_data_anonymization.ipynb rename to docs/extras/guides/privacy/01_presidio_data_anonymization.ipynb index 4b4b718e29..c06157c118 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb +++ b/docs/extras/guides/privacy/01_presidio_data_anonymization.ipynb @@ -6,7 +6,7 @@ "source": [ "# Data anonymization with Microsoft Presidio\n", "\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization.ipynb)\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/01_presidio_data_anonymization.ipynb)\n", "\n", "## Use case\n", "\n", @@ -439,8 +439,6 @@ "metadata": {}, "source": [ "## Future works\n", - "\n", - "- **deanonymization** - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able restore the original data.\n", "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object." 
] } diff --git a/docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb b/docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb new file mode 100644 index 0000000000..4c75523969 --- /dev/null +++ b/docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reversible data anonymization with Microsoft Presidio\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb)\n", + "\n", + "\n", + "## Use case\n", + "\n", + "We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n", + "\n", + "## Overview\n", + "\n", + "We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n", + "\n", + "1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n", + "```\n", + " {\n", + " \"PERSON\": {\n", + " \"\": \"\",\n", + " \"John Doe\": \"Slim Shady\"\n", + " },\n", + " \"PHONE_NUMBER\": {\n", + " \"111-111-1111\": \"555-555-5555\"\n", + " }\n", + " ...\n", + " }\n", + "```\n", + "\n", + "2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n", + "\n", + "Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n", + "\n", + "## Quickstart\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", + "# ! python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. 
By the way, my card number is: 4838637940262'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", + " faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", + " \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is what the full string we want to deanonymize looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maria Lynch recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4838637940262. \n", + "If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n", + "Maria Lynch would be very grateful!\n" + ] + } + ], + "source": [ + "# We know this data, as we set the faker_seed parameter\n", + "fake_name = \"Maria Lynch\"\n", + "fake_phone = \"7344131647\"\n", + "fake_email = \"jamesmichael@example.com\"\n", + "fake_credit_card = \"4838637940262\"\n", + "\n", + "anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n", + "Inside is some cash and his credit card with the number {fake_credit_card}. \n", + "If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n", + "{fake_name} would be very grateful!\"\"\"\n", + "\n", + "print(anonymized_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, using the `deanonymize` method, we can reverse the process:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n", + "Slim Shady would be very grateful!\n" + ] + } + ], + "source": [ + "print(anonymizer.deanonymize(anonymized_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using with LangChain Expression Language\n", + "\n", + "With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "text = f\"\"\"Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. 
\n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n", + "\n", + "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "template = \"\"\"Rewrite this text into an official, short email:\n", + "\n", + "{anonymized_text}\"\"\"\n", + "prompt = PromptTemplate.from_template(template)\n", + "llm = ChatOpenAI(temperature=0)\n", + "\n", + "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", + "response = chain.invoke(text)\n", + "print(response.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's add **deanonymization step** to our sequence:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n", + "\n", + "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n", + "response = chain.invoke(text)\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the factual value was replaced with the real one." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extra knowledge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", + " faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", + " \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anonymizing more texts will result in new mapping entries:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. 
I'm John Doe by the way.\"\n", + " )\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can save the mapping itself to a file for future use: " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# We can save the deanonymizer mapping as a JSON or YAML file\n", + "\n", + "anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then, load it in another `PresidioReversibleAnonymizer` instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Future works\n", + "\n", + "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", + "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs." 
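As a rough starting point for the second item above, exact-string substitution could be relaxed with standard-library fuzzy matching. The sketch below is purely illustrative and not part of `PresidioReversibleAnonymizer`; `fuzzy_deanonymize` is a hypothetical helper that operates on the `deanonymizer_mapping` structure shown earlier:

```python
import difflib


def fuzzy_deanonymize(text: str, deanonymizer_mapping: dict, cutoff: float = 0.85) -> str:
    """Best-effort reversal that tolerates small rewrites made by the LLM."""
    # Flatten {entity_type: {fake: original}} into a single {fake: original} dict
    flat = {
        fake: original
        for entity_map in deanonymizer_mapping.values()
        for fake, original in entity_map.items()
    }
    words = text.split()
    for i, word in enumerate(words):
        # Accept near-matches, e.g. trailing punctuation or small edits
        match = difflib.get_close_matches(word, flat.keys(), n=1, cutoff=cutoff)
        if match:
            words[i] = flat[match[0]]
    # Note: multi-word values such as "Maria Lynch" would need n-gram windows;
    # this word-by-word pass is deliberately kept simple
    return " ".join(words)
```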
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb b/docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb new file mode 100644 index 0000000000..c6c144ebae --- /dev/null +++ b/docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb @@ -0,0 +1,520 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multi-language data anonymization with Microsoft Presidio\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb)\n", + "\n", + "\n", + "## Use case\n", + "\n", + "Multi-language support in data pseudonymization is essential due to differences in language structures and cultural contexts. Different languages may have varying formats for personal identifiers. For example, the structure of names, locations and dates can differ greatly between languages and regions. Furthermore, non-alphanumeric characters, accents, and the direction of writing can impact pseudonymization processes. Without multi-language support, data could remain identifiable or be misinterpreted, compromising data privacy and accuracy. Hence, multi-language support enables effective and precise pseudonymization suited for global operations.\n", + "\n", + "## Overview\n", + "\n", + "PII detection in Microsoft Presidio relies on several components - in addition to the usual pattern matching (e.g. using regex), the analyzer uses a model for Named Entity Recognition (NER) to extract entities such as:\n", + "- `PERSON`\n", + "- `LOCATION`\n", + "- `DATE_TIME`\n", + "- `NRP`\n", + "- `ORGANIZATION`\n", + "\n", + "[[Source]](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)\n", + "\n", + "To handle NER in specific languages, we utilize unique models from the `spaCy` library, recognized for its extensive selection covering multiple languages and sizes. However, it's not restrictive, allowing for integration of alternative frameworks such as [Stanza](https://microsoft.github.io/presidio/analyzer/nlp_engines/spacy_stanza/) or [transformers](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/) when necessary.\n", + "\n", + "\n", + "## Quickstart\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", + "# ! 
python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, `PresidioAnonymizer` and `PresidioReversibleAnonymizer` use a model trained on English texts, so they handle other languages moderately well. \n", + "\n", + "For example, here the model did not detect the person:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Me llamo Sofía'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.anonymize(\"Me llamo Sofía\") # \"My name is Sofía\" in Spanish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "They may also take words from another language as actual entities. Here, both the word *'Yo'* (*'I'* in Spanish) and *Sofía* have been classified as `PERSON`:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Bridget Kirk soy Sally Knight'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.anonymize(\"Yo soy Sofía\") # \"I am Sofía\" in Spanish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to anonymise texts from other languages, you need to download other models and add them to the anonymiser configuration:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Download the models for the languages you want to use\n", + "# ! python -m spacy download en_core_web_md\n", + "# ! python -m spacy download es_core_news_md" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "nlp_config = {\n", + " \"nlp_engine_name\": \"spacy\",\n", + " \"models\": [\n", + " {\"lang_code\": \"en\", \"model_name\": \"en_core_web_md\"},\n", + " {\"lang_code\": \"es\", \"model_name\": \"es_core_news_md\"},\n", + " ],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have therefore added a Spanish language model. Note also that we have downloaded an alternative model for English as well - in this case we have replaced the large model `en_core_web_lg` (560MB) with its smaller version `en_core_web_md` (40MB) - the size is therefore reduced by 14 times! If you care about the speed of anonymisation, it is worth considering it.\n", + "\n", + "All models for the different languages can be found in the [spaCy documentation](https://spacy.io/usage/models).\n", + "\n", + "Now pass the configuration as the `languages_config` parameter to Anonymiser. 
As you can see, both previous examples work flawlessly:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Me llamo Michelle Smith\n", + "Yo soy Rachel Wright\n" + ] + } + ], + "source": [ + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\"],\n", + " languages_config=nlp_config,\n", + ")\n", + "\n", + "print(\n", + " anonymizer.anonymize(\"Me llamo Sofía\", language=\"es\")\n", + ") # \"My name is Sofía\" in Spanish\n", + "print(anonymizer.anonymize(\"Yo soy Sofía\", language=\"es\")) # \"I am Sofía\" in Spanish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the language indicated first in the configuration will be used when anonymising text (in this case English):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Ronnie Ayala\n" + ] + } + ], + "source": [ + "print(anonymizer.anonymize(\"My name is John\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced usage\n", + "\n", + "### Custom labels in NER model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It may be that the spaCy model has different class names than those supported by Microsoft Presidio by default. Take Polish, for example:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text: Wiktoria, Start: 12, End: 20, Label: persName\n" + ] + } + ], + "source": [ + "# ! python -m spacy download pl_core_news_md\n", + "\n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"pl_core_news_md\")\n", + "doc = nlp(\"Nazywam się Wiktoria\") # \"My name is Wiktoria\" in Polish\n", + "\n", + "for ent in doc.ents:\n", + " print(\n", + " f\"Text: {ent.text}, Start: {ent.start_char}, End: {ent.end_char}, Label: {ent.label_}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The name *Wiktoria* was classified as `persName`, which does not correspond to the default class names `PERSON`/`PER` implemented in Microsoft Presidio (look for `CHECK_LABEL_GROUPS` in [SpacyRecognizer implementation](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)). 
\n", + "\n", + "You can find out more about custom labels in spaCy models (including your own, trained ones) in [this thread](https://github.com/microsoft/presidio/issues/851).\n", + "\n", + "That's why our sentence will not be anonymized:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Wiktoria\n" + ] + } + ], + "source": [ + "nlp_config = {\n", + " \"nlp_engine_name\": \"spacy\",\n", + " \"models\": [\n", + " {\"lang_code\": \"en\", \"model_name\": \"en_core_web_md\"},\n", + " {\"lang_code\": \"es\", \"model_name\": \"es_core_news_md\"},\n", + " {\"lang_code\": \"pl\", \"model_name\": \"pl_core_news_md\"},\n", + " ],\n", + "}\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"LOCATION\", \"DATE_TIME\"],\n", + " languages_config=nlp_config,\n", + ")\n", + "\n", + "print(\n", + " anonymizer.anonymize(\"Nazywam się Wiktoria\", language=\"pl\")\n", + ") # \"My name is Wiktoria\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To address this, create your own `SpacyRecognizer` with your own class mapping and add it to the anonymizer:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_analyzer.predefined_recognizers import SpacyRecognizer\n", + "\n", + "polish_check_label_groups = [\n", + " ({\"LOCATION\"}, {\"placeName\", \"geogName\"}),\n", + " ({\"PERSON\"}, {\"persName\"}),\n", + " ({\"DATE_TIME\"}, {\"date\", \"time\"}),\n", + "]\n", + "\n", + "spacy_recognizer = SpacyRecognizer(\n", + " supported_language=\"pl\",\n", + " check_label_groups=polish_check_label_groups,\n", + ")\n", + "\n", + "anonymizer.add_recognizer(spacy_recognizer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now everything works smoothly:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Morgan Walters\n" + ] + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\"Nazywam się Wiktoria\", language=\"pl\")\n", + ") # \"My name is Wiktoria\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try on more complex example:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Ernest Liu. New Taylorburgh to moje miasto rodzinne. Urodziłam się 1987-01-19\n" + ] + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"Nazywam się Wiktoria. Płock to moje miasto rodzinne. Urodziłam się dnia 6 kwietnia 2001 roku\",\n", + " language=\"pl\",\n", + " )\n", + ") # \"My name is Wiktoria. Płock is my home town. I was born on 6 April 2001\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, thanks to class mapping, the anonymiser can cope with different types of entities. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom language-specific operators\n", + "\n", + "In the example above, the sentence has been anonymised correctly, but the fake data does not fit the Polish language at all. 
Custom operators can therefore be added, which will resolve the issue:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from faker import Faker\n", + "from presidio_anonymizer.entities import OperatorConfig\n", + "\n", + "fake = Faker(locale=\"pl_PL\") # Setting faker to provide Polish data\n", + "\n", + "new_operators = {\n", + " \"PERSON\": OperatorConfig(\"custom\", {\"lambda\": lambda _: fake.first_name_female()}),\n", + " \"LOCATION\": OperatorConfig(\"custom\", {\"lambda\": lambda _: fake.city()}),\n", + "}\n", + "\n", + "anonymizer.add_operators(new_operators)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Marianna. Szczecin to moje miasto rodzinne. Urodziłam się 1976-11-16\n" + ] + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"Nazywam się Wiktoria. Płock to moje miasto rodzinne. Urodziłam się dnia 6 kwietnia 2001 roku\",\n", + " language=\"pl\",\n", + " )\n", + ") # \"My name is Wiktoria. Płock is my home town. I was born on 6 April 2001\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Limitations\n", + "\n", + "Remember - results are as good as your recognizers and as your NER models!\n", + "\n", + "Look at the example below - we downloaded the small model for Spanish (12MB) and it no longer performs as well as the medium version (40MB):" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: es_core_news_sm. Result: Me llamo Sofía\n", + "Model: es_core_news_md. Result: Me llamo Lawrence Davis\n" + ] + } + ], + "source": [ + "# ! python -m spacy download es_core_news_sm\n", + "\n", + "for model in [\"es_core_news_sm\", \"es_core_news_md\"]:\n", + " nlp_config = {\n", + " \"nlp_engine_name\": \"spacy\",\n", + " \"models\": [\n", + " {\"lang_code\": \"es\", \"model_name\": model},\n", + " ],\n", + " }\n", + "\n", + " anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\"],\n", + " languages_config=nlp_config,\n", + " )\n", + "\n", + " print(\n", + " f\"Model: {model}. Result: {anonymizer.anonymize('Me llamo Sofía', language='es')}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In many cases, even the larger models from spaCy will not be sufficient - there are already other, more complex and better methods of detecting named entities, based on transformers. You can read more about this [here](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Future works\n", + "\n", + "- **automatic language detection** - instead of passing the language as a parameter in `anonymizer.anonymize`, we could detect the language/s beforehand and then use the corresponding NER model." 
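A taste of what that could look like, assuming the third-party `langdetect` package (any language-identification library would do) and reusing the `supported_languages` attribute the anonymizer already exposes; `anonymize_auto` is a hypothetical wrapper, not library functionality:

```python
# ! pip install langdetect
from langdetect import detect

from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON"],
    languages_config=nlp_config,  # e.g. the en/es/pl configuration defined above
)


def anonymize_auto(text: str) -> str:
    """Detect the language of the text, then route it to the matching NER model."""
    language = detect(text)  # returns ISO codes such as "en", "es" or "pl"
    if language not in anonymizer.supported_languages:
        language = anonymizer.supported_languages[0]  # fall back to the main language
    return anonymizer.anonymize(text, language=language)
```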
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb b/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb deleted file mode 100644 index 480b263278..0000000000 --- a/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb +++ /dev/null @@ -1,461 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Reversible data anonymization with Microsoft Presidio\n", - "\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb)\n", - "\n", - "\n", - "## Use case\n", - "\n", - "We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n", - "\n", - "## Overview\n", - "\n", - "We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n", - "\n", - "1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n", - "```\n", - " {\n", - " \"PERSON\": {\n", - " \"\": \"\",\n", - " \"John Doe\": \"Slim Shady\"\n", - " },\n", - " \"PHONE_NUMBER\": {\n", - " \"111-111-1111\": \"555-555-5555\"\n", - " }\n", - " ...\n", - " }\n", - "```\n", - "\n", - "2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n", - "\n", - "Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n", - "\n", - "## Quickstart\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Install necessary packages\n", - "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", - "# ! python -m spacy download en_core_web_lg" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. 
By the way, my card number is: 4838637940262'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", - "\n", - "anonymizer = PresidioReversibleAnonymizer(\n", - " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", - " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", - " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", - " faker_seed=42,\n", - ")\n", - "\n", - "anonymizer.anonymize(\n", - " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", - " \"By the way, my card number is: 4916 0387 9536 0861\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is what the full string we want to deanonymize looks like:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maria Lynch recently lost his wallet. \n", - "Inside is some cash and his credit card with the number 4838637940262. \n", - "If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n", - "Maria Lynch would be very grateful!\n" - ] - } - ], - "source": [ - "# We know this data, as we set the faker_seed parameter\n", - "fake_name = \"Maria Lynch\"\n", - "fake_phone = \"7344131647\"\n", - "fake_email = \"jamesmichael@example.com\"\n", - "fake_credit_card = \"4838637940262\"\n", - "\n", - "anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n", - "Inside is some cash and his credit card with the number {fake_credit_card}. \n", - "If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n", - "{fake_name} would be very grateful!\"\"\"\n", - "\n", - "print(anonymized_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now, using the `deanonymize` method, we can reverse the process:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Slim Shady recently lost his wallet. \n", - "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", - "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n", - "Slim Shady would be very grateful!\n" - ] - } - ], - "source": [ - "print(anonymizer.deanonymize(anonymized_text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using with LangChain Expression Language\n", - "\n", - "With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "text = f\"\"\"Slim Shady recently lost his wallet. \n", - "Inside is some cash and his credit card with the number 4916 0387 9536 0861. 
\n", - "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dear Sir/Madam,\n", - "\n", - "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n", - "\n", - "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", - "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", - "\n", - "Yours faithfully,\n", - "\n", - "[Your Name]\n" - ] - } - ], - "source": [ - "from langchain.prompts.prompt import PromptTemplate\n", - "from langchain.chat_models import ChatOpenAI\n", - "\n", - "anonymizer = PresidioReversibleAnonymizer()\n", - "\n", - "template = \"\"\"Rewrite this text into an official, short email:\n", - "\n", - "{anonymized_text}\"\"\"\n", - "prompt = PromptTemplate.from_template(template)\n", - "llm = ChatOpenAI(temperature=0)\n", - "\n", - "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", - "response = chain.invoke(text)\n", - "print(response.content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's add **deanonymization step** to our sequence:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dear Sir/Madam,\n", - "\n", - "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n", - "\n", - "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", - "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", - "\n", - "Yours faithfully,\n", - "\n", - "[Your Name]\n" - ] - } - ], - "source": [ - "chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n", - "response = chain.invoke(text)\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the factual value was replaced with the real one." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Extra knowledge" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n", - " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", - " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", - " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", - "\n", - "anonymizer = PresidioReversibleAnonymizer(\n", - " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", - " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", - " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", - " faker_seed=42,\n", - ")\n", - "\n", - "anonymizer.anonymize(\n", - " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", - " \"By the way, my card number is: 4916 0387 9536 0861\"\n", - ")\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Anonymizing more texts will result in new mapping entries:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", - " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", - " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", - " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", - " '3537672423884966': '4001 9192 5753 7193'}}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(\n", - " anonymizer.anonymize(\n", - " \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. 
I'm John Doe by the way.\"\n", - " )\n", - ")\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can save the mapping itself to a file for future use: " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# We can save the deanonymizer mapping as a JSON or YAML file\n", - "\n", - "anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", - "# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then, load it in another `PresidioReversibleAnonymizer` instance:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "anonymizer = PresidioReversibleAnonymizer()\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", - " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", - " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", - " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", - " '3537672423884966': '4001 9192 5753 7193'}}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Future works\n", - "\n", - "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", - "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/libs/experimental/langchain_experimental/data_anonymizer/base.py b/libs/experimental/langchain_experimental/data_anonymizer/base.py index 875032342a..292d2a2a0f 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/base.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from typing import Optional class AnonymizerBase(ABC): @@ -8,12 +9,12 @@ class AnonymizerBase(ABC): wrapping the behavior for all methods in a base class. """ - def anonymize(self, text: str) -> str: + def anonymize(self, text: str, language: Optional[str] = None) -> str: """Anonymize text""" - return self._anonymize(text) + return self._anonymize(text, language) @abstractmethod - def _anonymize(self, text: str) -> str: + def _anonymize(self, text: str, language: Optional[str]) -> str: """Abstract method to anonymize text""" diff --git a/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py index c2a339088e..9015679f20 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py @@ -27,8 +27,8 @@ def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callab fake.random_choices(string.ascii_lowercase + string.digits, length=26) ), "IP_ADDRESS": lambda _: fake.ipv4_public(), - "LOCATION": lambda _: fake.address(), - "DATE_TIME": lambda _: fake.iso8601(), + "LOCATION": lambda _: fake.city(), + "DATE_TIME": lambda _: fake.date(), "NRP": lambda _: str(fake.random_number(digits=8, fix_len=True)), "MEDICAL_LICENSE": lambda _: fake.bothify(text="??######").upper(), "URL": lambda _: fake.url(), diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index d4886eb32c..b2be1dc5a1 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -24,6 +24,8 @@ from langchain_experimental.data_anonymizer.faker_presidio_mapping import ( try: from presidio_analyzer import AnalyzerEngine + from presidio_analyzer.nlp_engine import NlpEngineProvider + except ImportError as e: raise ImportError( "Could not import presidio_analyzer, please install with " @@ -44,12 +46,29 @@ if TYPE_CHECKING: from presidio_analyzer import EntityRecognizer, RecognizerResult from presidio_anonymizer.entities import EngineResult +# Configuring Anonymizer for multiple languages +# Detailed description and examples can be found here: +# langchain/docs/extras/guides/privacy/multi_language_anonymization.ipynb +DEFAULT_LANGUAGES_CONFIG = { + # You can also use Stanza or transformers library. 
+ # See https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/ + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": "en", "model_name": "en_core_web_lg"}, + # {"lang_code": "de", "model_name": "de_core_news_md"}, + # {"lang_code": "es", "model_name": "es_core_news_md"}, + # ... + # List of available models: https://spacy.io/usage/models + ], +} + class PresidioAnonymizerBase(AnonymizerBase): def __init__( self, analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, + languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, faker_seed: Optional[int] = None, ): """ @@ -60,6 +79,11 @@ class PresidioAnonymizerBase(AnonymizerBase): Operators allow for custom anonymization of detected PII. Learn more: https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/ + languages_config: Configuration for the NLP engine. + First language in the list will be used as the main language + in self.anonymize(...) when no language is specified. + Learn more: + https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/ faker_seed: Seed used to initialize faker. Defaults to None, in which case faker will be seeded randomly and provide random values. @@ -81,7 +105,15 @@ class PresidioAnonymizerBase(AnonymizerBase): ).items() } ) - self._analyzer = AnalyzerEngine() + + provider = NlpEngineProvider(nlp_configuration=languages_config) + nlp_engine = provider.create_engine() + + self.supported_languages = list(nlp_engine.nlp.keys()) + + self._analyzer = AnalyzerEngine( + supported_languages=self.supported_languages, nlp_engine=nlp_engine + ) self._anonymizer = AnonymizerEngine() def add_recognizer(self, recognizer: EntityRecognizer) -> None: @@ -103,18 +135,31 @@ class PresidioAnonymizerBase(AnonymizerBase): class PresidioAnonymizer(PresidioAnonymizerBase): - def _anonymize(self, text: str) -> str: + def _anonymize(self, text: str, language: Optional[str] = None) -> str: """Anonymize text. Each PII entity is replaced with a fake value. Each time fake values will be different, as they are generated randomly. Args: text: text to anonymize + language: language to use for analysis of PII + If None, the first (main) language in the list + of languages specified in the configuration will be used. """ + if language is None: + language = self.supported_languages[0] + + if language not in self.supported_languages: + raise ValueError( + f"Language '{language}' is not supported. " + f"Supported languages are: {self.supported_languages}. " + "Change your language configuration file to add more languages." + ) + results = self._analyzer.analyze( text, entities=self.analyzed_fields, - language="en", + language=language, ) return self._anonymizer.anonymize( @@ -129,9 +174,10 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB self, analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, + languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, faker_seed: Optional[int] = None, ): - super().__init__(analyzed_fields, operators, faker_seed) + super().__init__(analyzed_fields, operators, languages_config, faker_seed) self._deanonymizer_mapping = DeanonymizerMapping() @property @@ -191,7 +237,7 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB self._deanonymizer_mapping.update(new_deanonymizer_mapping) - def _anonymize(self, text: str) -> str: + def _anonymize(self, text: str, language: Optional[str] = None) -> str: """Anonymize text. 
Each PII entity is replaced with a fake value. Each time fake values will be different, as they are generated randomly. @@ -200,11 +246,24 @@ class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerB Args: text: text to anonymize + language: language to use for analysis of PII + If None, the first (main) language in the list + of languages specified in the configuration will be used. """ + if language is None: + language = self.supported_languages[0] + + if language not in self.supported_languages: + raise ValueError( + f"Language '{language}' is not supported. " + f"Supported languages are: {self.supported_languages}. " + "Change your language configuration file to add more languages." + ) + analyzer_results = self._analyzer.analyze( text, entities=self.analyzed_fields, - language="en", + language=language, ) filtered_analyzer_results = ( From 1d2b6c3c67fdd5dbd3f9161001703fae40668a71 Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 7 Sep 2023 14:45:07 -0700 Subject: [PATCH 04/13] Reorganize presidio anonymization docs --- .../index.ipynb} | 0 .../multi_language.ipynb} | 0 .../reversible.ipynb} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename docs/extras/guides/privacy/{01_presidio_data_anonymization.ipynb => presidio_data_anonymization/index.ipynb} (100%) rename docs/extras/guides/privacy/{03_presidio_multi_language_anonymization.ipynb => presidio_data_anonymization/multi_language.ipynb} (100%) rename docs/extras/guides/privacy/{02_presidio_reversible_anonymization.ipynb => presidio_data_anonymization/reversible.ipynb} (100%) diff --git a/docs/extras/guides/privacy/01_presidio_data_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb similarity index 100% rename from docs/extras/guides/privacy/01_presidio_data_anonymization.ipynb rename to docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb diff --git a/docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb similarity index 100% rename from docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb rename to docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb diff --git a/docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb similarity index 100% rename from docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb rename to docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb From 41a25486113c51ad1ed9c348843fbc480242fe7f Mon Sep 17 00:00:00 2001 From: Bagatur Date: Thu, 7 Sep 2023 14:47:09 -0700 Subject: [PATCH 05/13] Fix presidio docs Colab links --- .../presidio_data_anonymization/index.ipynb | 4 +- .../multi_language.ipynb | 4 +- .../reversible.ipynb | 918 +++++++++--------- 3 files changed, 463 insertions(+), 463 deletions(-) diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb index c06157c118..2502a45092 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb @@ -6,7 +6,7 @@ "source": [ "# Data anonymization with Microsoft Presidio\n", "\n", - "[![Open In 
Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/01_presidio_data_anonymization.ipynb)\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb)\n", "\n", "## Use case\n", "\n", @@ -459,7 +459,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb index c6c144ebae..63ba8931a6 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb @@ -6,7 +6,7 @@ "source": [ "# Mutli-language data anonymization with Microsoft Presidio\n", "\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/03_presidio_multi_language_anonymization.ipynb)\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb)\n", "\n", "\n", "## Use case\n", @@ -512,7 +512,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb index 4c75523969..de5655ba1e 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb @@ -1,461 +1,461 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Reversible data anonymization with Microsoft Presidio\n", - "\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/02_presidio_reversible_anonymization.ipynb)\n", - "\n", - "\n", - "## Use case\n", - "\n", - "We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n", - "\n", - "## Overview\n", - "\n", - "We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n", - "\n", - "1. 
anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n", - "```\n", - " {\n", - " \"PERSON\": {\n", - " \"\": \"\",\n", - " \"John Doe\": \"Slim Shady\"\n", - " },\n", - " \"PHONE_NUMBER\": {\n", - " \"111-111-1111\": \"555-555-5555\"\n", - " }\n", - " ...\n", - " }\n", - "```\n", - "\n", - "2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n", - "\n", - "Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n", - "\n", - "## Quickstart\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# Install necessary packages\n", - "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", - "# ! python -m spacy download en_core_web_lg" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. By the way, my card number is: 4838637940262'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", - "\n", - "anonymizer = PresidioReversibleAnonymizer(\n", - " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", - " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", - " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", - " faker_seed=42,\n", - ")\n", - "\n", - "anonymizer.anonymize(\n", - " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", - " \"By the way, my card number is: 4916 0387 9536 0861\"\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This is what the full string we want to deanonymize looks like:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Maria Lynch recently lost his wallet. \n", - "Inside is some cash and his credit card with the number 4838637940262. \n", - "If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n", - "Maria Lynch would be very grateful!\n" - ] - } - ], - "source": [ - "# We know this data, as we set the faker_seed parameter\n", - "fake_name = \"Maria Lynch\"\n", - "fake_phone = \"7344131647\"\n", - "fake_email = \"jamesmichael@example.com\"\n", - "fake_credit_card = \"4838637940262\"\n", - "\n", - "anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n", - "Inside is some cash and his credit card with the number {fake_credit_card}. 
\n", - "If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n", - "{fake_name} would be very grateful!\"\"\"\n", - "\n", - "print(anonymized_text)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And now, using the `deanonymize` method, we can reverse the process:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Slim Shady recently lost his wallet. \n", - "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", - "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n", - "Slim Shady would be very grateful!\n" - ] - } - ], - "source": [ - "print(anonymizer.deanonymize(anonymized_text))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Using with LangChain Expression Language\n", - "\n", - "With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "text = f\"\"\"Slim Shady recently lost his wallet. \n", - "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", - "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dear Sir/Madam,\n", - "\n", - "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n", - "\n", - "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", - "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", - "\n", - "Yours faithfully,\n", - "\n", - "[Your Name]\n" - ] - } - ], - "source": [ - "from langchain.prompts.prompt import PromptTemplate\n", - "from langchain.chat_models import ChatOpenAI\n", - "\n", - "anonymizer = PresidioReversibleAnonymizer()\n", - "\n", - "template = \"\"\"Rewrite this text into an official, short email:\n", - "\n", - "{anonymized_text}\"\"\"\n", - "prompt = PromptTemplate.from_template(template)\n", - "llm = ChatOpenAI(temperature=0)\n", - "\n", - "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", - "response = chain.invoke(text)\n", - "print(response.content)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, let's add **deanonymization step** to our sequence:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Dear Sir/Madam,\n", - "\n", - "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. 
\n", - "\n", - "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", - "\n", - "Your prompt assistance in this matter would be greatly appreciated.\n", - "\n", - "Yours faithfully,\n", - "\n", - "[Your Name]\n" - ] - } - ], - "source": [ - "chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n", - "response = chain.invoke(text)\n", - "print(response)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the factual value was replaced with the real one." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Extra knowledge" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: " - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n", - " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", - " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", - " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", - "\n", - "anonymizer = PresidioReversibleAnonymizer(\n", - " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", - " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", - " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", - " faker_seed=42,\n", - ")\n", - "\n", - "anonymizer.anonymize(\n", - " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", - " \"By the way, my card number is: 4916 0387 9536 0861\"\n", - ")\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Anonymizing more texts will result in new mapping entries:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n" - ] - }, - { - "data": { - "text/plain": [ - "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", - " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", - " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", - " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", - " '3537672423884966': '4001 9192 5753 7193'}}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "print(\n", - " anonymizer.anonymize(\n", - " \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. 
I'm John Doe by the way.\"\n", - " )\n", - ")\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can save the mapping itself to a file for future use: " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "# We can save the deanonymizer mapping as a JSON or YAML file\n", - "\n", - "anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", - "# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then, load it in another `PresidioReversibleAnonymizer` instance:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{}" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "anonymizer = PresidioReversibleAnonymizer()\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", - " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", - " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", - " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", - " '3537672423884966': '4001 9192 5753 7193'}}" - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", - "\n", - "anonymizer.deanonymizer_mapping" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Future works\n", - "\n", - "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", - "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs." 
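To make the first item concrete, here is a hypothetical sketch of entity-consistent faking: one fake value is cached per original value, so repeated mentions of the same person stay aligned. None of these names exist in `langchain_experimental`; this is only an illustration of the idea.

```python
from faker import Faker


class ConsistentPersonFaker:
    """Illustrative only: reuse one fake name per original name."""

    def __init__(self, seed=None):
        if seed is not None:
            Faker.seed(seed)
        self._faker = Faker()
        self._cache = {}

    def fake_name(self, original: str) -> str:
        # Repeated occurrences of "John Doe" all map to the same fake value,
        # instead of a fresh name per occurrence.
        if original not in self._cache:
            self._cache[original] = self._faker.name()
        return self._cache[original]


faker = ConsistentPersonFaker(seed=42)
assert faker.fake_name("John Doe") == faker.fake_name("John Doe")
```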
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.4" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reversible data anonymization with Microsoft Presidio\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb)\n", + "\n", + "\n", + "## Use case\n", + "\n", + "We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n", + "\n", + "## Overview\n", + "\n", + "We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n", + "\n", + "1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n", + "```\n", + " {\n", + " \"PERSON\": {\n", + " \"\": \"\",\n", + " \"John Doe\": \"Slim Shady\"\n", + " },\n", + " \"PHONE_NUMBER\": {\n", + " \"111-111-1111\": \"555-555-5555\"\n", + " }\n", + " ...\n", + " }\n", + "```\n", + "\n", + "2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n", + "\n", + "Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n", + "\n", + "## Quickstart\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", + "# ! python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. 
By the way, my card number is: 4838637940262'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", + " faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", + " \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is what the full string we want to deanonymize looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maria Lynch recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4838637940262. \n", + "If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n", + "Maria Lynch would be very grateful!\n" + ] + } + ], + "source": [ + "# We know this data, as we set the faker_seed parameter\n", + "fake_name = \"Maria Lynch\"\n", + "fake_phone = \"7344131647\"\n", + "fake_email = \"jamesmichael@example.com\"\n", + "fake_credit_card = \"4838637940262\"\n", + "\n", + "anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n", + "Inside is some cash and his credit card with the number {fake_credit_card}. \n", + "If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n", + "{fake_name} would be very grateful!\"\"\"\n", + "\n", + "print(anonymized_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, using the `deanonymize` method, we can reverse the process:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n", + "Slim Shady would be very grateful!\n" + ] + } + ], + "source": [ + "print(anonymizer.deanonymize(anonymized_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using with LangChain Expression Language\n", + "\n", + "With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "text = f\"\"\"Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. 
\n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n", + "\n", + "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "template = \"\"\"Rewrite this text into an official, short email:\n", + "\n", + "{anonymized_text}\"\"\"\n", + "prompt = PromptTemplate.from_template(template)\n", + "llm = ChatOpenAI(temperature=0)\n", + "\n", + "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", + "response = chain.invoke(text)\n", + "print(response.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's add **deanonymization step** to our sequence:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n", + "\n", + "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n", + "response = chain.invoke(text)\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the factual value was replaced with the real one." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extra knowledge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", + " faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", + " \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anonymizing more texts will result in new mapping entries:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n" + ] }, - "nbformat": 4, - "nbformat_minor": 4 -} \ No newline at end of file + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. 
I'm John Doe by the way.\"\n", + " )\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can save the mapping itself to a file for future use: " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# We can save the deanonymizer mapping as a JSON or YAML file\n", + "\n", + "anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then, load it in another `PresidioReversibleAnonymizer` instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + " '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Future works\n", + "\n", + "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", + "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs." 
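As a starting point for the second item, a fuzzy pass can be layered on top of the exact-match substitution. A rough sketch with a hypothetical helper (not part of `langchain_experimental`), assuming the `deanonymizer_mapping` structure shown above:

```python
from difflib import SequenceMatcher


def fuzzy_deanonymize(text: str, mapping: dict, threshold: float = 0.85) -> str:
    """Replace exact fake values first, then fuzzy-match slightly altered ones."""
    pairs = [(fake, orig) for entity in mapping.values() for fake, orig in entity.items()]
    for fake, orig in pairs:
        text = text.replace(fake, orig)  # exact pass
    for fake, orig in pairs:
        n = len(fake.split())
        words = text.split()
        # compare every n-gram of the text against the fake value
        for i in range(len(words) - n + 1):
            candidate = " ".join(words[i : i + n])
            if SequenceMatcher(None, candidate, fake).ratio() >= threshold:
                text = text.replace(candidate, orig)
    return text


# "Maria Lync" (a near-miss of "Maria Lynch") should still map back to "Slim Shady"
fuzzy_deanonymize("Maria Lync recently lost his wallet.", anonymizer.deanonymizer_mapping)
```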
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 49e0c83126b9c77ed5d113f96ad12f944439b087 Mon Sep 17 00:00:00 2001 From: Bagatur <22008038+baskaryan@users.noreply.github.com> Date: Thu, 7 Sep 2023 14:56:38 -0700 Subject: [PATCH 06/13] Split LCEL cookbook (#10342) --- .../extras/expression_language/cookbook.ipynb | 1831 ----------------- .../cookbook/code_writing.ipynb | 119 ++ .../expression_language/cookbook/index.mdx | 11 + .../expression_language/cookbook/memory.ipynb | 180 ++ .../cookbook/moderation.ipynb | 133 ++ .../cookbook/multiple_chains.ipynb | 240 +++ .../cookbook/prompt_llm_parser.ipynb | 431 ++++ .../cookbook/retrieval.ipynb | 461 +++++ .../expression_language/cookbook/sql_db.ipynb | 227 ++ .../expression_language/cookbook/tools.ipynb | 122 ++ .../expression_language/how_to/_category_.yml | 2 + .../how_to/functions.ipynb | 158 ++ .../expression_language/interface.ipynb | 15 +- 13 files changed, 2096 insertions(+), 1834 deletions(-) delete mode 100644 docs/extras/expression_language/cookbook.ipynb create mode 100644 docs/extras/expression_language/cookbook/code_writing.ipynb create mode 100644 docs/extras/expression_language/cookbook/index.mdx create mode 100644 docs/extras/expression_language/cookbook/memory.ipynb create mode 100644 docs/extras/expression_language/cookbook/moderation.ipynb create mode 100644 docs/extras/expression_language/cookbook/multiple_chains.ipynb create mode 100644 docs/extras/expression_language/cookbook/prompt_llm_parser.ipynb create mode 100644 docs/extras/expression_language/cookbook/retrieval.ipynb create mode 100644 docs/extras/expression_language/cookbook/sql_db.ipynb create mode 100644 docs/extras/expression_language/cookbook/tools.ipynb create mode 100644 docs/extras/expression_language/how_to/_category_.yml create mode 100644 docs/extras/expression_language/how_to/functions.ipynb diff --git a/docs/extras/expression_language/cookbook.ipynb b/docs/extras/expression_language/cookbook.ipynb deleted file mode 100644 index c10d0a7672..0000000000 --- a/docs/extras/expression_language/cookbook.ipynb +++ /dev/null @@ -1,1831 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9a9acd2e", - "metadata": {}, - "source": [ - "# Cookbook\n", - "\n", - "In this notebook we'll take a look at a few common types of sequences to create." - ] - }, - { - "cell_type": "markdown", - "id": "93aa2c87", - "metadata": {}, - "source": [ - "## PromptTemplate + LLM\n", - "\n", - "A PromptTemplate -> LLM is a core chain that is used in most other larger chains/systems." 
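Since the composed object is itself a `Runnable`, it exposes the same `invoke`/`batch`/`stream` interface as its parts. A quick sketch, assuming an OpenAI API key is configured in the environment:

```python
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

chain = ChatPromptTemplate.from_template("tell me a joke about {topic}") | ChatOpenAI()

chain.invoke({"topic": "bears"})                      # single input -> AIMessage
chain.batch([{"topic": "bears"}, {"topic": "fish"}])  # several inputs in parallel
for chunk in chain.stream({"topic": "bears"}):        # stream tokens as they arrive
    print(chunk.content, end="", flush=True)
```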
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "466b65b3", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.prompts import ChatPromptTemplate\n", - "from langchain.chat_models import ChatOpenAI" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "3c634ef0", - "metadata": {}, - "outputs": [], - "source": [ - "model = ChatOpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d1850a1f", - "metadata": {}, - "outputs": [], - "source": [ - "prompt = ChatPromptTemplate.from_template(\"tell me a joke about {foo}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "56d0669f", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e3d0a6cd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"})" - ] - }, - { - "cell_type": "markdown", - "id": "7eb9ef50", - "metadata": {}, - "source": [ - "Often times we want to attach kwargs to the model that's passed in. Here's a few examples of that:" - ] - }, - { - "cell_type": "markdown", - "id": "0b1d8f88", - "metadata": {}, - "source": [ - "### Attaching Stop Sequences" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "562a06bf", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model.bind(stop=[\"\\n\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "43f5d04c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content=\"Why don't bears wear shoes?\", additional_kwargs={}, example=False)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"})" - ] - }, - { - "cell_type": "markdown", - "id": "f3eaf88a", - "metadata": {}, - "source": [ - "### Attaching Function Call information" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f94b71b2", - "metadata": {}, - "outputs": [], - "source": [ - "functions = [\n", - " {\n", - " \"name\": \"joke\",\n", - " \"description\": \"A joke\",\n", - " \"parameters\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"setup\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"The setup for the joke\"\n", - " },\n", - " \"punchline\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"The punchline for the joke\"\n", - " }\n", - " },\n", - " \"required\": [\"setup\", \"punchline\"]\n", - " }\n", - " }\n", - " ]\n", - "chain = prompt | model.bind(function_call= {\"name\": \"joke\"}, functions= functions)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "decf7710", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='', additional_kwargs={'function_call': {'name': 'joke', 'arguments': '{\\n \"setup\": \"Why don\\'t bears wear shoes?\",\\n \"punchline\": \"Because they have bear feet!\"\\n}'}}, example=False)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"}, config={})" - ] - }, - { - "cell_type": "markdown", - "id": "9098c5ed", - "metadata": {}, - "source": [ - "## PromptTemplate + LLM 
+ OutputParser\n",
- "\n",
- "We can also add in an output parser to easily transform the raw LLM/ChatModel output into a more workable format"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "id": "f799664d",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.schema.output_parser import StrOutputParser"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "id": "cc194c78",
- "metadata": {},
- "outputs": [],
- "source": [
- "chain = prompt | model | StrOutputParser()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "77acf448",
- "metadata": {},
- "source": [
- "Notice that this now returns a string - a much more workable format for downstream tasks"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "id": "e3d69a18",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "\"Sure, here's a bear joke for you:\\n\\nWhy don't bears like fast food?\\n\\nBecause they can't catch it!\""
- ]
- },
- "execution_count": 12,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain.invoke({\"foo\": \"bears\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "c01864e5",
- "metadata": {},
- "source": [
- "### Functions Output Parser\n",
- "\n",
- "When you specify the function to return, you may just want to parse that directly"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "id": "ad0dd88e",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n",
- "chain = (\n",
- "    prompt \n",
- "    | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n",
- "    | JsonOutputFunctionsParser()\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "id": "1e7aa8eb",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'setup': \"Why don't bears wear shoes?\",\n",
- " 'punchline': 'Because they have bear feet!'}"
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain.invoke({\"foo\": \"bears\"})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "id": "d4aa1a01",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser\n",
- "chain = (\n",
- "    prompt \n",
- "    | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n",
- "    | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n",
- ")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "8b6df9ba",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "\"Why don't bears wear shoes?\""
- ]
- },
- "execution_count": 16,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain.invoke({\"foo\": \"bears\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "2ed58136",
- "metadata": {},
- "source": [
- "## Passthroughs and itemgetter\n",
- "\n",
- "Oftentimes when constructing a chain you may want to pass the original input variables along to future steps in the chain. How you do this depends on what the input is:\n",
- "\n",
- "- If the original input was a string, then you likely just want to pass along the string. This can be done with `RunnablePassthrough`. For an example of this, see `LLMChain + Retriever`\n",
- "- If the original input was a dictionary, then you likely want to pass along specific keys. This can be done with `itemgetter`. 
For an example of this see `Multiple LLM Chains`" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "5d3d8ffe", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnablePassthrough\n", - "from operator import itemgetter" - ] - }, - { - "cell_type": "markdown", - "id": "91c5ef3d", - "metadata": {}, - "source": [ - "## LLMChain + Retriever\n", - "\n", - "Let's now look at adding in a retrieval step, which adds up to a \"retrieval-augmented generation\" chain" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "33be32af", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.vectorstores import Chroma\n", - "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.schema.runnable import RunnablePassthrough" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "df3f3fa2", - "metadata": {}, - "outputs": [], - "source": [ - "# Create the retriever\n", - "vectorstore = Chroma.from_texts([\"harrison worked at kensho\"], embedding=OpenAIEmbeddings())\n", - "retriever = vectorstore.as_retriever()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "bfc47ec1", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Answer the question based only on the following context:\n", - "{context}\n", - "\n", - "Question: {question}\n", - "\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "eae31755", - "metadata": {}, - "outputs": [], - "source": [ - "chain = (\n", - " {\"context\": retriever, \"question\": RunnablePassthrough()} \n", - " | prompt \n", - " | model \n", - " | StrOutputParser()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "id": "f3040b0c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "'Harrison worked at Kensho.'" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke(\"where did harrison work?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "e1d20c7c", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Answer the question based only on the following context:\n", - "{context}\n", - "\n", - "Question: {question}\n", - "\n", - "Answer in the following language: {language}\n", - "\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)\n", - "\n", - "chain = {\n", - " \"context\": itemgetter(\"question\") | retriever, \n", - " \"question\": itemgetter(\"question\"), \n", - " \"language\": itemgetter(\"language\")\n", - "} | prompt | model | StrOutputParser()" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "7ee8b2d4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "'Harrison ha lavorato a Kensho.'" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"question\": \"where did harrison work\", \"language\": \"italian\"})" - ] - }, - { - "cell_type": "markdown", - "id": "f007669c", - "metadata": {}, - "source": [ - "## 
Conversational Retrieval Chain\n", - "\n", - "We can easily add in conversation history. This primarily means adding in chat_message_history" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "3f30c348", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnableMap\n", - "from langchain.schema import format_document" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "64ab1dbf", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.prompts.prompt import PromptTemplate\n", - "\n", - "_template = \"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n", - "\n", - "Chat History:\n", - "{chat_history}\n", - "Follow Up Input: {question}\n", - "Standalone question:\"\"\"\n", - "CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "7d628c97", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Answer the question based only on the following context:\n", - "{context}\n", - "\n", - "Question: {question}\n", - "\"\"\"\n", - "ANSWER_PROMPT = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "f60a5d0f", - "metadata": {}, - "outputs": [], - "source": [ - "DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template=\"{page_content}\")\n", - "def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator=\"\\n\\n\"):\n", - " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", - " return document_separator.join(doc_strings)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "7d007db6", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Tuple, List\n", - "def _format_chat_history(chat_history: List[Tuple]) -> str:\n", - " buffer = \"\"\n", - " for dialogue_turn in chat_history:\n", - " human = \"Human: \" + dialogue_turn[0]\n", - " ai = \"Assistant: \" + dialogue_turn[1]\n", - " buffer += \"\\n\" + \"\\n\".join([human, ai])\n", - " return buffer" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "5c32cc89", - "metadata": {}, - "outputs": [], - "source": [ - "_inputs = RunnableMap(\n", - " {\n", - " \"standalone_question\": {\n", - " \"question\": lambda x: x[\"question\"],\n", - " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", - " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", - " }\n", - ")\n", - "_context = {\n", - " \"context\": itemgetter(\"standalone_question\") | retriever | _combine_documents,\n", - " \"question\": lambda x: x[\"standalone_question\"]\n", - "}\n", - "conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "135c8205", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conversational_qa_chain.invoke({\n", - " \"question\": \"where did harrison work?\",\n", - " \"chat_history\": [],\n", - 
"})" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "424e7e7a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "AIMessage(content='Harrison worked at Kensho.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conversational_qa_chain.invoke({\n", - " \"question\": \"where did he work?\",\n", - " \"chat_history\": [(\"Who wrote this notebook?\", \"Harrison\")],\n", - "})" - ] - }, - { - "cell_type": "markdown", - "id": "c5543183", - "metadata": {}, - "source": [ - "### With Memory and returning source documents\n", - "\n", - "This shows how to use memory with the above. For memory, we need to manage that outside at the memory. For returning the retrieved documents, we just need to pass them through all the way." - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "e31dd17c", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.memory import ConversationBufferMemory" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "d4bffe94", - "metadata": {}, - "outputs": [], - "source": [ - "memory = ConversationBufferMemory(return_messages=True, output_key=\"answer\", input_key=\"question\")" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "733be985", - "metadata": {}, - "outputs": [], - "source": [ - "# First we add a step to load memory\n", - "# This needs to be a RunnableMap because its the first input\n", - "loaded_memory = RunnableMap(\n", - " {\n", - " \"question\": itemgetter(\"question\"),\n", - " \"memory\": memory.load_memory_variables,\n", - " }\n", - ")\n", - "# Next we add a step to expand memory into the variables\n", - "expanded_memory = {\n", - " \"question\": itemgetter(\"question\"),\n", - " \"chat_history\": lambda x: x[\"memory\"][\"history\"]\n", - "}\n", - "\n", - "# Now we calculate the standalone question\n", - "standalone_question = {\n", - " \"standalone_question\": {\n", - " \"question\": lambda x: x[\"question\"],\n", - " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", - " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", - "}\n", - "# Now we retrieve the documents\n", - "retrieved_documents = {\n", - " \"docs\": itemgetter(\"standalone_question\") | retriever,\n", - " \"question\": lambda x: x[\"standalone_question\"]\n", - "}\n", - "# Now we construct the inputs for the final prompt\n", - "final_inputs = {\n", - " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", - " \"question\": itemgetter(\"question\")\n", - "}\n", - "# And finally, we do the part that returns the answers\n", - "answer = {\n", - " \"answer\": final_inputs | ANSWER_PROMPT | ChatOpenAI(),\n", - " \"docs\": itemgetter(\"docs\"),\n", - "}\n", - "# And now we put it all together!\n", - "final_chain = loaded_memory | expanded_memory | standalone_question | retrieved_documents | answer" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "806e390c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "{'answer': AIMessage(content='Harrison was employed 
at Kensho.', additional_kwargs={}, example=False),\n", - " 'docs': [Document(page_content='harrison worked at kensho', metadata={})]}" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = {\"question\": \"where did harrison work?\"}\n", - "result = final_chain.invoke(inputs)\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "977399fd", - "metadata": {}, - "outputs": [], - "source": [ - "# Note that the memory does not save automatically\n", - "# This will be improved in the future\n", - "# For now you need to save it yourself\n", - "memory.save_context(inputs, {\"answer\": result[\"answer\"].content})" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "f94f7de4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'history': [HumanMessage(content='where did harrison work?', additional_kwargs={}, example=False),\n", - " AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)]}" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "memory.load_memory_variables({})" - ] - }, - { - "cell_type": "markdown", - "id": "0f2bf8d3", - "metadata": {}, - "source": [ - "## Multiple LLM Chains\n", - "\n", - "This can also be used to string together multiple LLMChains" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "d65d4e9e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'El país en el que se encuentra la ciudad de Honolulu, Hawái, donde nació Barack Obama, el 44º presidente de los Estados Unidos, es Estados Unidos.'" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from operator import itemgetter\n", - "\n", - "prompt1 = ChatPromptTemplate.from_template(\"what is the city {person} is from?\")\n", - "prompt2 = ChatPromptTemplate.from_template(\"what country is the city {city} in? respond in {language}\")\n", - "\n", - "chain1 = prompt1 | model | StrOutputParser()\n", - "\n", - "chain2 = {\"city\": chain1, \"language\": itemgetter(\"language\")} | prompt2 | model | StrOutputParser()\n", - "\n", - "chain2.invoke({\"person\": \"obama\", \"language\": \"spanish\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "878f8176", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnableMap\n", - "prompt1 = ChatPromptTemplate.from_template(\"generate a random color\")\n", - "prompt2 = ChatPromptTemplate.from_template(\"what is a fruit of color: {color}\")\n", - "prompt3 = ChatPromptTemplate.from_template(\"what is countries flag that has the color: {color}\")\n", - "prompt4 = ChatPromptTemplate.from_template(\"What is the color of {fruit} and {country}\")\n", - "chain1 = prompt1 | model | StrOutputParser()\n", - "chain2 = RunnableMap(steps={\"color\": chain1}) | {\n", - " \"fruit\": prompt2 | model | StrOutputParser(),\n", - " \"country\": prompt3 | model | StrOutputParser(),\n", - "} | prompt4" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "d621a870", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ChatPromptValue(messages=[HumanMessage(content=\"What is the color of A fruit that is of color #FF4500 is typically an orange fruit. 
and The country's flag that has the color #FF4500 is the flag of India.\", additional_kwargs={}, example=False)])" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain2.invoke({})" - ] - }, - { - "cell_type": "markdown", - "id": "6d75a313-f1c8-4e94-9a17-24e0bf4a2bdc", - "metadata": {}, - "source": [ - "### Branching and Merging\n", - "\n", - "You may want the output of one component to be processed by 2 or more other components. [RunnableMaps](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableMap.html) let you split or fork the chain so multiple components can process the input in parallel. Later, other components can join or merge the results to synthesize a final response. This type of chain creates a computation graph that looks like the following:\n", - "\n", - "```text\n", - " Input\n", - " / \\\n", - " / \\\n", - " Branch1 Branch2\n", - " \\ /\n", - " \\ /\n", - " Combine\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "id": "247fa0bd-4596-4063-8cb3-1d7fc119d982", - "metadata": {}, - "outputs": [], - "source": [ - "planner = (\n", - " ChatPromptTemplate.from_template(\n", - " \"Generate an argument about: {input}\"\n", - " )\n", - " | ChatOpenAI()\n", - " | StrOutputParser()\n", - " | {\"base_response\": RunnablePassthrough()}\n", - ")\n", - "\n", - "arguments_for = (\n", - " ChatPromptTemplate.from_template(\n", - " \"List the pros or positive aspects of {base_response}\"\n", - " )\n", - " | ChatOpenAI()\n", - " | StrOutputParser()\n", - ")\n", - "arguments_against = (\n", - " ChatPromptTemplate.from_template(\n", - " \"List the cons or negative aspects of {base_response}\"\n", - " )\n", - " | ChatOpenAI()\n", - " | StrOutputParser()\n", - ")\n", - "\n", - "final_responder = (\n", - " ChatPromptTemplate.from_messages(\n", - " [\n", - " (\"ai\", \"{original_response}\"),\n", - " (\"human\", \"Pros:\\n{results_1}\\n\\nCons:\\n{results_2}\"),\n", - " (\"system\", \"Generate a final response given the critique\"),\n", - " ]\n", - " )\n", - " | ChatOpenAI()\n", - " | StrOutputParser()\n", - ")\n", - "\n", - "chain = (\n", - " planner \n", - " | {\n", - " \"results_1\": arguments_for,\n", - " \"results_2\": arguments_against,\n", - " \"original_response\": itemgetter(\"base_response\"),\n", - " }\n", - " | final_responder\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 65, - "id": "2564f310-0674-4bb1-9c4e-d7848ca73511", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"While Scrum has its limitations and potential drawbacks, it is important to note that these can be mitigated with proper understanding, implementation, and adaptation. Here are some ways to address the critique:\\n\\n1. Lack of structure: While Scrum promotes self-organization, it is essential to provide clear guidelines, define roles and responsibilities, and establish a shared understanding of the project's goals and expectations. This can be achieved through effective communication and regular alignment meetings.\\n\\n2. Time and resource constraints: Proper planning, prioritization, and resource allocation are crucial in managing the sprint cycles effectively. Teams can leverage tools and techniques such as backlog refinement, sprint planning, and capacity planning to ensure that workloads are manageable and realistic.\\n\\n3. 
Managing large teams: Scaling frameworks like Scrum of Scrums or LeSS (Large-Scale Scrum) can be implemented to coordinate the efforts of multiple Scrum teams. These frameworks provide mechanisms for communication, synchronization, and alignment across teams.\\n\\n4. Limited documentation: Although Scrum emphasizes working software over comprehensive documentation, it is important to strike a balance. Teams can adopt lightweight documentation practices such as user stories, acceptance criteria, and sprint reviews to capture relevant information and promote knowledge transfer.\\n\\n5. Resolving conflicts and fostering collaboration: Conflict resolution techniques and team-building activities can help address conflicts and foster a collaborative environment. Encouraging open and honest communication, promoting a culture of trust and respect, and providing opportunities for team members to share ideas and perspectives can contribute to better team dynamics.\\n\\n6. Long-term planning: While Scrum focuses on short-term goals, it is still important to have a long-term vision and roadmap. Teams can incorporate longer-term planning activities, such as release planning or product roadmapping, to align the project with broader strategic objectives and ensure a balance between adaptability and long-term goals.\\n\\n7. Skilled Scrum Master: Investing in the training and development of a skilled Scrum Master is crucial. Organizations can provide training and support for Scrum Masters to enhance their understanding of Scrum principles, facilitation skills, and ability to address challenges effectively.\\n\\n8. Scope management: To prevent scope creep, teams should establish a well-defined product backlog and prioritize requirements based on value and feasibility. Regular backlog refinement and stakeholder engagement can help ensure that changes are evaluated and incorporated in a controlled manner.\\n\\n9. Applicability to different domains: While Scrum originated in software development, it has been successfully applied in various industries and domains. Organizations can tailor Scrum practices to suit their specific needs, making necessary adaptations and incorporating domain-specific practices as required.\\n\\nBy addressing these concerns and adapting Scrum to the specific context, organizations can maximize the benefits of Scrum while mitigating potential drawbacks. It is important to continuously evaluate and improve the implementation to ensure the best outcomes for the project and the team.\"" - ] - }, - "execution_count": 65, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"input\": \"scrum\"})" - ] - }, - { - "cell_type": "markdown", - "id": "d094d637", - "metadata": {}, - "source": [ - "## Router\n", - "\n", - "You can also use the router runnable to conditionally route inputs to different runnables." 
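For simple cases, a plain Python function wrapped in `RunnableLambda` can do the routing as well. A minimal sketch, reusing the `model` and `ChatPromptTemplate` from earlier (the digit check is only a stand-in for a real classifier such as the tagging chain below):

```python
from langchain.schema.runnable import RunnableLambda

math_chain = ChatPromptTemplate.from_template(
    "You are a math genius. Answer the question: {question}"
) | model
english_chain = ChatPromptTemplate.from_template(
    "You are an english major. Answer the question: {question}"
) | model

def route(inputs: dict):
    # crude heuristic: questions containing digits go to the math chain
    if any(char.isdigit() for char in inputs["question"]):
        return math_chain.invoke(inputs)
    return english_chain.invoke(inputs)

RunnableLambda(route).invoke({"question": "whats 2 + 2"})
```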
- ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "252625fd", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains import create_tagging_chain_pydantic\n", - "from pydantic import BaseModel, Field\n", - "\n", - "class PromptToUse(BaseModel):\n", - " \"\"\"Used to determine which prompt to use to answer the user's input.\"\"\"\n", - " \n", - " name: str = Field(description=\"Should be one of `math` or `english`\")" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "57886e84", - "metadata": {}, - "outputs": [], - "source": [ - "tagger = create_tagging_chain_pydantic(PromptToUse, ChatOpenAI(temperature=0))" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "a303b089", - "metadata": {}, - "outputs": [], - "source": [ - "chain1 = ChatPromptTemplate.from_template(\"You are a math genius. Answer the question: {question}\") | ChatOpenAI()\n", - "chain2 = ChatPromptTemplate.from_template(\"You are an english major. Answer the question: {question}\") | ChatOpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 69, - "id": "7aa9ea06", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RouterRunnable\n", - "router = RouterRunnable({\"math\": chain1, \"english\": chain2})" - ] - }, - { - "cell_type": "code", - "execution_count": 70, - "id": "6a3d3f5d", - "metadata": {}, - "outputs": [], - "source": [ - "chain = {\n", - " \"key\": {\"input\": lambda x: x[\"question\"]} | tagger | (lambda x: x['text'].name),\n", - " \"input\": {\"question\": lambda x: x[\"question\"]}\n", - "} | router" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "8aeda930", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='Thank you for the compliment! The sum of 2 and 2 is 4.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 71, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"question\": \"whats 2 + 2\"})" - ] - }, - { - "cell_type": "markdown", - "id": "29781123", - "metadata": {}, - "source": [ - "## Tools\n", - "\n", - "You can use any LangChain tool easily." - ] - }, - { - "cell_type": "code", - "execution_count": 72, - "id": "9232d2a9", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.tools import DuckDuckGoSearchRun" - ] - }, - { - "cell_type": "code", - "execution_count": 73, - "id": "a0c64d2c", - "metadata": {}, - "outputs": [], - "source": [ - "search = DuckDuckGoSearchRun()" - ] - }, - { - "cell_type": "code", - "execution_count": 74, - "id": "391969b6", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"turn the following user input into a search query for a search engine:\n", - "\n", - "{input}\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 75, - "id": "e3d9d20d", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model | StrOutputParser() | search" - ] - }, - { - "cell_type": "code", - "execution_count": 76, - "id": "55f2967d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"What sports games are on TV today & tonight? Watch and stream live sports on TV today, tonight, tomorrow. Today's 2023 sports TV schedule includes football, basketball, baseball, hockey, motorsports, soccer and more. Watch on TV or stream online on ESPN, FOX, FS1, CBS, NBC, ABC, Peacock, Paramount+, fuboTV, local channels and many other networks. 
MLB Games Tonight: How to Watch on TV, Streaming & Odds - Wednesday, September 6. Texas Rangers second baseman Marcus Semien, left, tags out Houston Astros' Jose Altuve (27) who was attempting to stretch out a single in the seventh inning of a baseball game, Monday, Sept. 4, 2023, in Arlington, Texas. (AP Photo/Tony Gutierrez) (APMedia) There ... MLB Games Tonight: How to Watch on TV, Streaming & Odds - Sunday, September 3. Los Angeles Dodgers right fielder Mookie Betts, left, gives a thumbs up to Vanessa Bryant, right, widow of Kobe ... WEEK 16 NFL TV SCHEDULE. NFL Games Thursday, 12/21/23. TIME ET. TV. New Orleans at LA Rams. 8:15pm. AMZN. NFL Games Saturday, 12/23/23. TIME ET. The second half of tonight's college football schedule still has some good games remaining to watch on your television.. We've already seen an exciting one when Colorado upset TCU. And we saw some ...\""
- ]
- },
- "execution_count": 76,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain.invoke({\"input\": \"I'd like to figure out what games are tonight\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "fbc4bf6e",
- "metadata": {},
- "source": [
- "## Arbitrary Functions\n",
- "\n",
- "You can use arbitrary functions in the pipeline.\n",
- "\n",
- "Note that these functions must accept a SINGLE argument. If you have a function that accepts multiple arguments, you should write a wrapper that accepts a single input and unpacks it into multiple arguments."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 77,
- "id": "6bb221b3",
- "metadata": {},
- "outputs": [],
- "source": [
- "from langchain.schema.runnable import RunnableLambda\n",
- "\n",
- "def length_function(text):\n",
- "    return len(text)\n",
- "\n",
- "def _multiple_length_function(text1, text2):\n",
- "    return len(text1) * len(text2)\n",
- "\n",
- "def multiple_length_function(_dict):\n",
- "    return _multiple_length_function(_dict[\"text1\"], _dict[\"text2\"])\n",
- "\n",
- "prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n",
- "\n",
- "chain1 = prompt | model\n",
- "\n",
- "chain = {\n",
- "    \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n",
- "    \"b\": {\"text1\": itemgetter(\"foo\"), \"text2\": itemgetter(\"bar\")} | RunnableLambda(multiple_length_function)\n",
- "} | prompt | model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 78,
- "id": "5488ec85",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "AIMessage(content='3 + 9 equals 12.', additional_kwargs={}, example=False)"
- ]
- },
- "execution_count": 78,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "chain.invoke({\"foo\": \"bar\", \"bar\": \"gah\"})"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "4728ddd9-914d-42ce-ae9b-72c9ce8ec940",
- "metadata": {},
- "source": [
- "## Accepting a Runnable Config\n",
- "\n",
- "Runnable lambdas can optionally accept a [RunnableConfig](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.config.RunnableConfig.html?highlight=runnableconfig#langchain.schema.runnable.config.RunnableConfig), which they can use to pass callbacks, tags, and other configuration information to nested runs."
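In its simplest form, a function that declares a second `config` parameter receives the active `RunnableConfig` when wrapped in `RunnableLambda`. A small sketch of that shape before the fuller recipe below (which threads the config into a nested chain):

```python
from langchain.schema.runnable import RunnableConfig, RunnableLambda

def describe_run(text: str, config: RunnableConfig):
    # RunnableConfig behaves like a dict carrying tags, callbacks, and metadata
    return {"input": text, "tags": config.get("tags", [])}

RunnableLambda(describe_run).invoke("hello", config={"tags": ["my-tag"]})
# -> {'input': 'hello', 'tags': ['my-tag']}
```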
- ] - }, - { - "cell_type": "code", - "execution_count": 139, - "id": "80b3b5f6-5d58-44b9-807e-cce9a46bf49f", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnableConfig" - ] - }, - { - "cell_type": "code", - "execution_count": 149, - "id": "ff0daf0c-49dd-4d21-9772-e5fa133c5f36", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "\n", - "def parse_or_fix(text: str, config: RunnableConfig):\n", - " fixing_chain = (\n", - " ChatPromptTemplate.from_template(\n", - " \"Fix the following text:\\n\\n```text\\n{input}\\n```\\nError: {error}\"\n", - " \" Don't narrate, just respond with the fixed data.\"\n", - " )\n", - " | ChatOpenAI()\n", - " | StrOutputParser()\n", - " )\n", - " for _ in range(3):\n", - " try:\n", - " return json.loads(text)\n", - " except Exception as e:\n", - " text = fixing_chain.invoke({\"input\": text, \"error\": e}, config)\n", - " return \"Failed to parse\"" - ] - }, - { - "cell_type": "code", - "execution_count": 152, - "id": "1a5e709e-9d75-48c7-bb9c-503251990505", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Tokens Used: 65\n", - "\tPrompt Tokens: 56\n", - "\tCompletion Tokens: 9\n", - "Successful Requests: 1\n", - "Total Cost (USD): $0.00010200000000000001\n" - ] - } - ], - "source": [ - "from langchain.callbacks import get_openai_callback\n", - "\n", - "with get_openai_callback() as cb:\n", - " RunnableLambda(parse_or_fix).invoke(\"{foo: bar}\", {\"tags\": [\"my-tag\"], \"callbacks\": [cb]})\n", - " print(cb)" - ] - }, - { - "cell_type": "markdown", - "id": "506e9636", - "metadata": {}, - "source": [ - "## SQL Database\n", - "\n", - "We can also try to replicate our SQLDatabaseChain using this style." - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "id": "7a927516", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Based on the table schema below, write a SQL query that would answer the user's question:\n", - "{schema}\n", - "\n", - "Question: {question}\n", - "SQL Query:\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "id": "3f51f386", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.utilities import SQLDatabase" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "id": "2ccca6fc", - "metadata": {}, - "outputs": [], - "source": [ - "db = SQLDatabase.from_uri(\"sqlite:///../../../../notebooks/Chinook.db\")" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "id": "05ba88ee", - "metadata": {}, - "outputs": [], - "source": [ - "def get_schema(_):\n", - " return db.get_table_info()" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "id": "a4eda902", - "metadata": {}, - "outputs": [], - "source": [ - "def run_query(query):\n", - " return db.run(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "id": "5046cb17", - "metadata": {}, - "outputs": [], - "source": [ - "inputs = {\n", - " \"schema\": RunnableLambda(get_schema),\n", - " \"question\": itemgetter(\"question\")\n", - "}\n", - "sql_response = (\n", - " RunnableMap(inputs)\n", - " | prompt\n", - " | model.bind(stop=[\"\\nSQLResult:\"])\n", - " | StrOutputParser()\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "id": "a5552039", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'SELECT COUNT(EmployeeId) FROM Employee'" - ] - }, - "execution_count": 114, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sql_response.invoke({\"question\": \"How many employees are there?\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "id": "d6fee130", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Based on the table schema below, question, sql query, and sql response, write a natural language response:\n", - "{schema}\n", - "\n", - "Question: {question}\n", - "SQL Query: {query}\n", - "SQL Response: {response}\"\"\"\n", - "prompt_response = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "id": "923aa634", - "metadata": {}, - "outputs": [], - "source": [ - "full_chain = (\n", - " RunnableMap({\n", - " \"question\": itemgetter(\"question\"),\n", - " \"query\": sql_response,\n", - " }) \n", - " | {\n", - " \"schema\": RunnableLambda(get_schema),\n", - " \"question\": itemgetter(\"question\"),\n", - " \"query\": itemgetter(\"query\"),\n", - " \"response\": lambda x: db.run(x[\"query\"]) \n", - " } \n", - " | prompt_response \n", - " | model\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "id": "e94963d8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='There are 8 employees.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_chain.invoke({\"question\": \"How many employees are there?\"})" - ] - }, - { - "cell_type": "markdown", - "id": "f09fd305", - "metadata": {}, - "source": [ - "## Code Writing" - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "id": "bd7c259a", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.utilities import PythonREPL\n", - "from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "id": "73795d2d", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Write some python code to solve the user's problem. \n", - "\n", - "Return only python code in Markdown format, e.g.:\n", - "\n", - "```python\n", - "....\n", - "```\"\"\"\n", - "prompt = ChatPromptTemplate(messages=[\n", - " SystemMessagePromptTemplate.from_template(template),\n", - " HumanMessagePromptTemplate.from_template(\"{input}\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "id": "42859e8a", - "metadata": {}, - "outputs": [], - "source": [ - "def _sanitize_output(text: str):\n", - " _, after = text.split(\"```python\")\n", - " return after.split(\"```\")[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "id": "5ded1a86", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "id": "208c2b75", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Python REPL can execute arbitrary code. Use with caution.\n" - ] - }, - { - "data": { - "text/plain": [ - "'4\\n'" - ] - }, - "execution_count": 122, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"input\": \"whats 2 plus 2\"})" - ] - }, - { - "cell_type": "markdown", - "id": "5062941a", - "metadata": {}, - "source": [ - "## Memory\n", - "\n", - "This shows how to add memory to an arbitrary chain. 
Right now, you can use the memory classes but need to hook it up manually" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "id": "7998efd8", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.memory import ConversationBufferMemory\n", - "from langchain.schema.runnable import RunnableMap\n", - "from langchain.prompts import MessagesPlaceholder\n", - "model = ChatOpenAI()\n", - "prompt = ChatPromptTemplate.from_messages([\n", - " (\"system\", \"You are a helpful chatbot\"),\n", - " MessagesPlaceholder(variable_name=\"history\"),\n", - " (\"human\", \"{input}\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "id": "fa0087f3", - "metadata": {}, - "outputs": [], - "source": [ - "memory = ConversationBufferMemory(return_messages=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "id": "06b531ae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'history': []}" - ] - }, - "execution_count": 125, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "memory.load_memory_variables({})" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "id": "d9437af6", - "metadata": {}, - "outputs": [], - "source": [ - "chain = RunnableMap({\n", - " \"input\": lambda x: x[\"input\"],\n", - " \"memory\": memory.load_memory_variables\n", - "}) | {\n", - " \"input\": lambda x: x[\"input\"],\n", - " \"history\": lambda x: x[\"memory\"][\"history\"]\n", - "} | prompt | model" - ] - }, - { - "cell_type": "code", - "execution_count": 127, - "id": "bed1e260", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 127, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = {\"input\": \"hi im bob\"}\n", - "response = chain.invoke(inputs)\n", - "response" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "id": "890475b4", - "metadata": {}, - "outputs": [], - "source": [ - "memory.save_context(inputs, {\"output\": response.content})" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "id": "e8fcb77f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'history': [HumanMessage(content='hi im bob', additional_kwargs={}, example=False),\n", - " AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)]}" - ] - }, - "execution_count": 129, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "memory.load_memory_variables({})" - ] - }, - { - "cell_type": "code", - "execution_count": 130, - "id": "d837d5c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='Your name is Bob.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 130, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = {\"input\": \"whats my name\"}\n", - "response = chain.invoke(inputs)\n", - "response" - ] - }, - { - "cell_type": "markdown", - "id": "4927a727-b4c8-453c-8c83-bd87b4fcac14", - "metadata": {}, - "source": [ - "## Moderation\n", - "\n", - "This shows how to add in moderation (or other safeguards) around your LLM application." 
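A related variant, sketched here as an assumption rather than taken from the notebook: the same moderation chain can also screen user input *before* it reaches the prompt (with default settings, flagged text is replaced by the policy message rather than raising an error):

```python
from langchain.chains import OpenAIModerationChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

moderate_input = OpenAIModerationChain()

# Screen the user's text before it ever reaches the prompt or model.
guarded_chain = (
    {"input": lambda x: moderate_input.run(x["input"])}
    | ChatPromptTemplate.from_template("Respond helpfully to: {input}")
    | ChatOpenAI()
)

guarded_chain.invoke({"input": "hello there"})
```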
- ] - }, - { - "cell_type": "code", - "execution_count": 131, - "id": "4f5f6449-940a-4f5c-97c0-39b71c3e2a68", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains import OpenAIModerationChain\n", - "from langchain.llms import OpenAI" - ] - }, - { - "cell_type": "code", - "execution_count": 132, - "id": "fcb8312b-7e7a-424f-a3ec-76738c9a9d21", - "metadata": {}, - "outputs": [], - "source": [ - "moderate = OpenAIModerationChain()" - ] - }, - { - "cell_type": "code", - "execution_count": 133, - "id": "b24b9148-f6b0-4091-8ea8-d3fb281bd950", - "metadata": {}, - "outputs": [], - "source": [ - "model = OpenAI()\n", - "prompt = ChatPromptTemplate.from_messages([\n", - " (\"system\", \"repeat after me: {input}\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 134, - "id": "1c8ed87c-9ca6-4559-bf60-d40e94a0af08", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model" - ] - }, - { - "cell_type": "code", - "execution_count": 135, - "id": "5256b9bd-381a-42b0-bfa8-7e6d18f853cb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n\\nYou are stupid.'" - ] - }, - "execution_count": 135, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"input\": \"you are stupid\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "id": "fe6e3b33-dc9a-49d5-b194-ba750c58a628", - "metadata": {}, - "outputs": [], - "source": [ - "moderated_chain = chain | moderate" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "id": "d8ba0cbd-c739-4d23-be9f-6ae092bd5ffb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': '\\n\\nYou are stupid.',\n", - " 'output': \"Text was found that violates OpenAI's content policy.\"}" - ] - }, - "execution_count": 137, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "moderated_chain.invoke({\"input\": \"you are stupid\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f07b5300-8676-48ee-ab77-3f2dc2ecd415", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.2" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/expression_language/cookbook/code_writing.ipynb b/docs/extras/expression_language/cookbook/code_writing.ipynb new file mode 100644 index 0000000000..25b039ce44 --- /dev/null +++ b/docs/extras/expression_language/cookbook/code_writing.ipynb @@ -0,0 +1,119 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f09fd305", + "metadata": {}, + "source": [ + "# Code writing\n", + "\n", + "Example of how to use LCEL to write Python code." 
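As a quick illustration (not part of the notebook itself) of the sanitization step defined below, which strips the Markdown fences from the model's reply and keeps only the code:

```python
def _sanitize_output(text: str) -> str:
    # Keep only the code between the opening python fence and the closing fence.
    _, after = text.split("```python")
    return after.split("```")[0]

sample = "Here you go:\n```python\nprint(2 + 2)\n```"
print(repr(_sanitize_output(sample)))  # -> '\nprint(2 + 2)\n'
```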
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bd7c259a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.utilities import PythonREPL" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "73795d2d", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Write some python code to solve the user's problem. \n", + "\n", + "Return only python code in Markdown format, e.g.:\n", + "\n", + "```python\n", + "....\n", + "```\"\"\"\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [(\"system\", template), (\"human\", \"{input}\")]\n", + ")\n", + "\n", + "model = ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "42859e8a", + "metadata": {}, + "outputs": [], + "source": [ + "def _sanitize_output(text: str):\n", + " _, after = text.split(\"```python\")\n", + " return after.split(\"```\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5ded1a86", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "208c2b75", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Python REPL can execute arbitrary code. Use with caution.\n" + ] + }, + { + "data": { + "text/plain": [ + "'4\\n'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"whats 2 plus 2\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/index.mdx b/docs/extras/expression_language/cookbook/index.mdx new file mode 100644 index 0000000000..6310fd50b9 --- /dev/null +++ b/docs/extras/expression_language/cookbook/index.mdx @@ -0,0 +1,11 @@ +--- +sidebar_position: 2 +--- + +# Cookbook + +import DocCardList from "@theme/DocCardList"; + +Example code for accomplishing common tasks with the LangChain Expression Language (LCEL). These examples show how to compose different Runnable (the core LCEL interface) components to achieve various tasks. If you're just getting acquainted with LCEL, the [Prompt + LLM](/docs/expression_language/cookbook/prompt_llm_parser) page is a good place to start. + + \ No newline at end of file diff --git a/docs/extras/expression_language/cookbook/memory.ipynb b/docs/extras/expression_language/cookbook/memory.ipynb new file mode 100644 index 0000000000..bef7e5ed01 --- /dev/null +++ b/docs/extras/expression_language/cookbook/memory.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5062941a", + "metadata": {}, + "source": [ + "# Adding memory\n", + "\n", + "This shows how to add memory to an arbitrary chain. 
Right now, you can use the memory classes but need to hook it up manually" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7998efd8", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.schema.runnable import RunnableMap\n", + "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n", + "\n", + "model = ChatOpenAI()\n", + "prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"You are a helpful chatbot\"),\n", + " MessagesPlaceholder(variable_name=\"history\"),\n", + " (\"human\", \"{input}\")\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fa0087f3", + "metadata": {}, + "outputs": [], + "source": [ + "memory = ConversationBufferMemory(return_messages=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "06b531ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'history': []}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memory.load_memory_variables({})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d9437af6", + "metadata": {}, + "outputs": [], + "source": [ + "chain = RunnableMap({\n", + " \"input\": lambda x: x[\"input\"],\n", + " \"memory\": memory.load_memory_variables\n", + "}) | {\n", + " \"input\": lambda x: x[\"input\"],\n", + " \"history\": lambda x: x[\"memory\"][\"history\"]\n", + "} | prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bed1e260", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = {\"input\": \"hi im bob\"}\n", + "response = chain.invoke(inputs)\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "890475b4", + "metadata": {}, + "outputs": [], + "source": [ + "memory.save_context(inputs, {\"output\": response.content})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e8fcb77f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'history': [HumanMessage(content='hi im bob', additional_kwargs={}, example=False),\n", + " AIMessage(content='Hello Bob! 
How can I assist you today?', additional_kwargs={}, example=False)]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memory.load_memory_variables({})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d837d5c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Your name is Bob.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = {\"input\": \"whats my name\"}\n", + "response = chain.invoke(inputs)\n", + "response" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/moderation.ipynb b/docs/extras/expression_language/cookbook/moderation.ipynb new file mode 100644 index 0000000000..cb4114d8e9 --- /dev/null +++ b/docs/extras/expression_language/cookbook/moderation.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4927a727-b4c8-453c-8c83-bd87b4fcac14", + "metadata": {}, + "source": [ + "# Adding moderation\n", + "\n", + "This shows how to add in moderation (or other safeguards) around your LLM application." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4f5f6449-940a-4f5c-97c0-39b71c3e2a68", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import OpenAIModerationChain\n", + "from langchain.llms import OpenAI\n", + "from langchain.prompts import ChatPromptTemplate" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fcb8312b-7e7a-424f-a3ec-76738c9a9d21", + "metadata": {}, + "outputs": [], + "source": [ + "moderate = OpenAIModerationChain()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b24b9148-f6b0-4091-8ea8-d3fb281bd950", + "metadata": {}, + "outputs": [], + "source": [ + "model = OpenAI()\n", + "prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"repeat after me: {input}\")\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1c8ed87c-9ca6-4559-bf60-d40e94a0af08", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5256b9bd-381a-42b0-bfa8-7e6d18f853cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n\\nYou are stupid.'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"you are stupid\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "fe6e3b33-dc9a-49d5-b194-ba750c58a628", + "metadata": {}, + "outputs": [], + "source": [ + "moderated_chain = chain | moderate" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d8ba0cbd-c739-4d23-be9f-6ae092bd5ffb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input': '\\n\\nYou are stupid',\n", + " 'output': \"Text was found that violates OpenAI's content policy.\"}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "moderated_chain.invoke({\"input\": \"you are stupid\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/multiple_chains.ipynb b/docs/extras/expression_language/cookbook/multiple_chains.ipynb new file mode 100644 index 0000000000..7db06a85f5 --- /dev/null +++ b/docs/extras/expression_language/cookbook/multiple_chains.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "877102d1-02ea-4fa3-8ec7-a08e242b95b3", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 2\n", + "title: Multiple chains\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "0f2bf8d3", + "metadata": {}, + "source": [ + "Runnables can easily be used to string together multiple Chains" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d65d4e9e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'El país donde se encuentra la ciudad de Honolulu, donde nació Barack Obama, el 44º Presidente de los Estados Unidos, es Estados Unidos. Honolulu se encuentra en la isla de Oahu, en el estado de Hawái.'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema import StrOutputParser\n", + "\n", + "prompt1 = ChatPromptTemplate.from_template(\"what is the city {person} is from?\")\n", + "prompt2 = ChatPromptTemplate.from_template(\"what country is the city {city} in? respond in {language}\")\n", + "\n", + "model = ChatOpenAI()\n", + "\n", + "chain1 = prompt1 | model | StrOutputParser()\n", + "\n", + "chain2 = {\"city\": chain1, \"language\": itemgetter(\"language\")} | prompt2 | model | StrOutputParser()\n", + "\n", + "chain2.invoke({\"person\": \"obama\", \"language\": \"spanish\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "878f8176", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableMap, RunnablePassthrough\n", + "\n", + "prompt1 = ChatPromptTemplate.from_template(\"generate a {attribute} color. Return the name of the color and nothing else:\")\n", + "prompt2 = ChatPromptTemplate.from_template(\"what is a fruit of color: {color}. Return the name of the fruit and nothing else:\")\n", + "prompt3 = ChatPromptTemplate.from_template(\"what is a country with a flag that has the color: {color}. 
Return the name of the country and nothing else:\")\n", + "prompt4 = ChatPromptTemplate.from_template(\"What is the color of {fruit} and the flag of {country}?\")\n", + "\n", + "model_parser = model | StrOutputParser()\n", + "\n", + "color_generator = {\"attribute\": RunnablePassthrough()} | prompt1 | {\"color\": model_parser}\n", + "color_to_fruit = prompt2 | model_parser\n", + "color_to_country = prompt3 | model_parser\n", + "question_generator = color_generator | {\"fruit\": color_to_fruit, \"country\": color_to_country} | prompt4" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d621a870", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ChatPromptValue(messages=[HumanMessage(content='What is the color of strawberry and the flag of China?', additional_kwargs={}, example=False)])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question_generator.invoke({\"warm\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b4a9812b-bead-4fd9-ae27-0b8be57e5dc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='The color of an apple is typically red or green. The flag of China is predominantly red with a large yellow star in the upper left corner and four smaller yellow stars surrounding it.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = question_generator.invoke({\"warm\"})\n", + "model.invoke(prompt)" + ] + }, + { + "cell_type": "markdown", + "id": "6d75a313-f1c8-4e94-9a17-24e0bf4a2bdc", + "metadata": {}, + "source": [ + "### Branching and Merging\n", + "\n", + "You may want the output of one component to be processed by 2 or more other components. [RunnableMaps](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableMap.html) let you split or fork the chain so multiple components can process the input in parallel. Later, other components can join or merge the results to synthesize a final response. 
This type of chain creates a computation graph that looks like the following:\n", + "\n", + "```text\n", + " Input\n", + " / \\\n", + " / \\\n", + " Branch1 Branch2\n", + " \\ /\n", + " \\ /\n", + " Combine\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "247fa0bd-4596-4063-8cb3-1d7fc119d982", + "metadata": {}, + "outputs": [], + "source": [ + "planner = (\n", + " ChatPromptTemplate.from_template(\n", + " \"Generate an argument about: {input}\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + " | {\"base_response\": RunnablePassthrough()}\n", + ")\n", + "\n", + "arguments_for = (\n", + " ChatPromptTemplate.from_template(\n", + " \"List the pros or positive aspects of {base_response}\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + ")\n", + "arguments_against = (\n", + " ChatPromptTemplate.from_template(\n", + " \"List the cons or negative aspects of {base_response}\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "final_responder = (\n", + " ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"ai\", \"{original_response}\"),\n", + " (\"human\", \"Pros:\\n{results_1}\\n\\nCons:\\n{results_2}\"),\n", + " (\"system\", \"Generate a final response given the critique\"),\n", + " ]\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "chain = (\n", + " planner \n", + " | {\n", + " \"results_1\": arguments_for,\n", + " \"results_2\": arguments_against,\n", + " \"original_response\": itemgetter(\"base_response\"),\n", + " }\n", + " | final_responder\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2564f310-0674-4bb1-9c4e-d7848ca73511", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'While Scrum has its potential cons and challenges, many organizations have successfully embraced and implemented this project management framework to great effect. The cons mentioned above can be mitigated or overcome with proper training, support, and a commitment to continuous improvement. It is also important to note that not all cons may be applicable to every organization or project.\\n\\nFor example, while Scrum may be complex initially, with proper training and guidance, teams can quickly grasp the concepts and practices. The lack of predictability can be mitigated by implementing techniques such as velocity tracking and release planning. The limited documentation can be addressed by maintaining a balance between lightweight documentation and clear communication among team members. The dependency on team collaboration can be improved through effective communication channels and regular team-building activities.\\n\\nScrum can be scaled and adapted to larger projects by using frameworks like Scrum of Scrums or LeSS (Large Scale Scrum). Concerns about speed versus quality can be addressed by incorporating quality assurance practices, such as continuous integration and automated testing, into the Scrum process. Scope creep can be managed by having a well-defined and prioritized product backlog, and a strong product owner can be developed through training and mentorship.\\n\\nResistance to change can be overcome by providing proper education and communication to stakeholders and involving them in the decision-making process. 
Ultimately, the cons of Scrum can be seen as opportunities for growth and improvement, and with the right mindset and support, they can be effectively managed.\\n\\nIn conclusion, while Scrum may have its challenges and potential cons, the benefits and advantages it offers in terms of collaboration, flexibility, adaptability, transparency, and customer satisfaction make it a widely adopted and successful project management framework. With proper implementation and continuous improvement, organizations can leverage Scrum to drive innovation, efficiency, and project success.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"scrum\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "poetry-venv", + "language": "python", + "name": "poetry-venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/prompt_llm_parser.ipynb b/docs/extras/expression_language/cookbook/prompt_llm_parser.ipynb new file mode 100644 index 0000000000..1b670904d5 --- /dev/null +++ b/docs/extras/expression_language/cookbook/prompt_llm_parser.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "abf7263d-3a62-4016-b5d5-b157f92f2070", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 0\n", + "title: Prompt + LLM\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "9a434f2b-9405-468c-9dfd-254d456b57a6", + "metadata": {}, + "source": [ + "The most common and valuable composition is taking:\n", + "\n", + "``PromptTemplate`` / ``ChatPromptTemplate`` -> ``LLM`` / ``ChatModel`` -> ``OutputParser``\n", + "\n", + "Almost any other chain you build will use this building block." + ] + }, + { + "cell_type": "markdown", + "id": "93aa2c87", + "metadata": {}, + "source": [ + "## PromptTemplate + LLM\n", + "\n", + "The simplest composition is just combining a prompt and model to create a chain that takes user input, adds it to a prompt, passes it to a model, and returns the raw model output.\n", + "\n", + "Note that you can mix and match PromptTemplate/ChatPromptTemplates and LLMs/ChatModels as you like here." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "466b65b3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "prompt = ChatPromptTemplate.from_template(\"tell me a joke about {foo}\")\n", + "model = ChatOpenAI()\n", + "chain = prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e3d0a6cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "7eb9ef50", + "metadata": {}, + "source": [ + "Oftentimes we want to attach kwargs that'll be passed to each model call. 
Here are a few examples of that:" + ] + }, + { + "cell_type": "markdown", + "id": "0b1d8f88", + "metadata": {}, + "source": [ + "### Attaching Stop Sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "562a06bf", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model.bind(stop=[\"\\n\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "43f5d04c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Why did the bear never wear shoes?', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "f3eaf88a", + "metadata": {}, + "source": [ + "### Attaching Function Call information" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f94b71b2", + "metadata": {}, + "outputs": [], + "source": [ + "functions = [\n", + " {\n", + " \"name\": \"joke\",\n", + " \"description\": \"A joke\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"setup\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The setup for the joke\"\n", + " },\n", + " \"punchline\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The punchline for the joke\"\n", + " }\n", + " },\n", + " \"required\": [\"setup\", \"punchline\"]\n", + " }\n", + " }\n", + " ]\n", + "chain = prompt | model.bind(function_call= {\"name\": \"joke\"}, functions= functions)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "decf7710", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='', additional_kwargs={'function_call': {'name': 'joke', 'arguments': '{\\n \"setup\": \"Why don\\'t bears wear shoes?\",\\n \"punchline\": \"Because they have bear feet!\"\\n}'}}, example=False)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"}, config={})" + ] + }, + { + "cell_type": "markdown", + "id": "9098c5ed", + "metadata": {}, + "source": [ + "## PromptTemplate + LLM + OutputParser\n", + "\n", + "We can also add in an output parser to easily transform the raw LLM/ChatModel output into a more workable format." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cc194c78", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.output_parser import StrOutputParser\n", + "\n", + "chain = prompt | model | StrOutputParser()" + ] + }, + { + "cell_type": "markdown", + "id": "77acf448", + "metadata": {}, + "source": [ + "Notice that this now returns a string - a much more workable format for downstream tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e3d69a18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\"" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "c01864e5", + "metadata": {}, + "source": [ + "### Functions Output Parser\n", + "\n", + "When you specify the function to return, you may just want to parse that directly." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ad0dd88e", + "metadata": {}, + "outputs": [], + "source": [ + "from 
langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n", + "\n", + "chain = (\n", + " prompt \n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonOutputFunctionsParser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1e7aa8eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'setup': \"Why don't bears like fast food?\",\n", + " 'punchline': \"Because they can't catch it!\"}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d4aa1a01", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser\n", + "\n", + "chain = (\n", + " prompt \n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8b6df9ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears wear shoes?\"" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "023fbccb-ef7d-489e-a9ba-f98e17283d51", + "metadata": {}, + "source": [ + "## Simplifying input\n", + "\n", + "To make invocation even simpler, we can add a `RunnableMap` to take care of creating the prompt input dict for us:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9601c0f0-71f9-4bd4-a672-7bd04084b018", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableMap, RunnablePassthrough\n", + "\n", + "map_ = RunnableMap({\"foo\": RunnablePassthrough()})\n", + "chain = (\n", + " map_ \n", + " | prompt\n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7ec4f154-fda5-4847-9220-41aa902fdc33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears wear shoes?\"" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"bears\")" + ] + }, + { + "cell_type": "markdown", + "id": "def00bfe-0f83-4805-8c8f-8a53f99fa8ea", + "metadata": {}, + "source": [ + "Since we're composing our map with another Runnable, we can even use some syntactic sugar and just use a dict:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7bf3846a-02ee-41a3-ba1b-a708827d4f3a", + "metadata": {}, + "outputs": [], + "source": [ + "chain = (\n", + " {\"foo\": RunnablePassthrough()} \n", + " | prompt\n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e566d6a1-538d-4cb5-a210-a63e082e4c74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears like fast food?\"" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"bears\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/retrieval.ipynb b/docs/extras/expression_language/cookbook/retrieval.ipynb new file mode 100644 index 0000000000..6579b1c7f3 --- /dev/null +++ b/docs/extras/expression_language/cookbook/retrieval.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "abe47592-909c-4844-bf44-9e55c2fb4bfa", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: RAG\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "91c5ef3d", + "metadata": {}, + "source": [ + "Let's look at adding in a retrieval step to a prompt and LLM, which adds up to a \"retrieval-augmented generation\" chain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f25d9e9-d192-42e9-af50-5660a4bfb0d9", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install langchain openai faiss-cpu" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "33be32af", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.schema.runnable import RunnablePassthrough\n", + "from langchain.vectorstores import FAISS" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bfc47ec1", + "metadata": {}, + "outputs": [], + "source": [ + "vectorstore = FAISS.from_texts([\"harrison worked at kensho\"], embedding=OpenAIEmbeddings())\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "template = \"\"\"Answer the question based only on the following context:\n", + "{context}\n", + "\n", + "Question: {question}\n", + "\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "model = ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "eae31755", + "metadata": {}, + "outputs": [], + "source": [ + "chain = (\n", + " {\"context\": retriever, \"question\": RunnablePassthrough()} \n", + " | prompt \n", + " | model \n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3040b0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Harrison worked at Kensho.'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"where did harrison work?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e1d20c7c", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Answer the question based only on the following context:\n", + "{context}\n", + "\n", + "Question: {question}\n", + "\n", + "Answer in the following language: {language}\n", + "\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "chain = {\n", + " \"context\": itemgetter(\"question\") | retriever, \n", + " \"question\": itemgetter(\"question\"), \n", + " \"language\": itemgetter(\"language\")\n", + "} | prompt | model | StrOutputParser()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 7, + "id": "7ee8b2d4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Harrison ha lavorato a Kensho.'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"question\": \"where did harrison work\", \"language\": \"italian\"})" + ] + }, + { + "cell_type": "markdown", + "id": "f007669c", + "metadata": {}, + "source": [ + "## Conversational Retrieval Chain\n", + "\n", + "We can easily add in conversation history. This primarily means adding in chat_message_history" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3f30c348", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableMap\n", + "from langchain.schema import format_document" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "64ab1dbf", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "\n", + "_template = \"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n", + "\n", + "Chat History:\n", + "{chat_history}\n", + "Follow Up Input: {question}\n", + "Standalone question:\"\"\"\n", + "CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d628c97", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Answer the question based only on the following context:\n", + "{context}\n", + "\n", + "Question: {question}\n", + "\"\"\"\n", + "ANSWER_PROMPT = ChatPromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f60a5d0f", + "metadata": {}, + "outputs": [], + "source": [ + "DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template=\"{page_content}\")\n", + "def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator=\"\\n\\n\"):\n", + " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", + " return document_separator.join(doc_strings)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7d007db6", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple, List\n", + "def _format_chat_history(chat_history: List[Tuple]) -> str:\n", + " buffer = \"\"\n", + " for dialogue_turn in chat_history:\n", + " human = \"Human: \" + dialogue_turn[0]\n", + " ai = \"Assistant: \" + dialogue_turn[1]\n", + " buffer += \"\\n\" + \"\\n\".join([human, ai])\n", + " return buffer" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5c32cc89", + "metadata": {}, + "outputs": [], + "source": [ + "_inputs = RunnableMap(\n", + " {\n", + " \"standalone_question\": {\n", + " \"question\": lambda x: x[\"question\"],\n", + " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", + " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", + " }\n", + ")\n", + "_context = {\n", + " \"context\": itemgetter(\"standalone_question\") | retriever | _combine_documents,\n", + " \"question\": lambda x: x[\"standalone_question\"]\n", + "}\n", + "conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "135c8205", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Harrison was employed 
at Kensho.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conversational_qa_chain.invoke({\n", + " \"question\": \"where did harrison work?\",\n", + " \"chat_history\": [],\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "424e7e7a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Harrison worked at Kensho.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conversational_qa_chain.invoke({\n", + " \"question\": \"where did he work?\",\n", + " \"chat_history\": [(\"Who wrote this notebook?\", \"Harrison\")],\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "c5543183", + "metadata": {}, + "source": [ + "### With Memory and returning source documents\n", + "\n", + "This shows how to use memory with the above. For memory, we need to manage it outside of the chain. For returning the retrieved documents, we just need to pass them through all the way." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e31dd17c", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.memory import ConversationBufferMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d4bffe94", + "metadata": {}, + "outputs": [], + "source": [ + "memory = ConversationBufferMemory(return_messages=True, output_key=\"answer\", input_key=\"question\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "733be985", + "metadata": {}, + "outputs": [], + "source": [ + "# First we add a step to load memory\n", + "# This needs to be a RunnableMap because it's the first input\n", + "loaded_memory = RunnableMap(\n", + " {\n", + " \"question\": itemgetter(\"question\"),\n", + " \"memory\": memory.load_memory_variables,\n", + " }\n", + ")\n", + "# Next we add a step to expand memory into the variables\n", + "expanded_memory = {\n", + " \"question\": itemgetter(\"question\"),\n", + " \"chat_history\": lambda x: x[\"memory\"][\"history\"]\n", + "}\n", + "\n", + "# Now we calculate the standalone question\n", + "standalone_question = {\n", + " \"standalone_question\": {\n", + " \"question\": lambda x: x[\"question\"],\n", + " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", + " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", + "}\n", + "# Now we retrieve the documents\n", + "retrieved_documents = {\n", + " \"docs\": itemgetter(\"standalone_question\") | retriever,\n", + " \"question\": lambda x: x[\"standalone_question\"]\n", + "}\n", + "# Now we construct the inputs for the final prompt\n", + "final_inputs = {\n", + " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", + " \"question\": itemgetter(\"question\")\n", + "}\n", + "# And finally, we do the part that returns the answers\n", + "answer = {\n", + " \"answer\": final_inputs | ANSWER_PROMPT | ChatOpenAI(),\n", + " \"docs\": itemgetter(\"docs\"),\n", + "}\n", + "# And now we put it all together!\n", + "final_chain = loaded_memory | expanded_memory | standalone_question | retrieved_documents | answer" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "806e390c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'answer': AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False),\n", + " 'docs': 
[Document(page_content='harrison worked at kensho', metadata={})]}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = {\"question\": \"where did harrison work?\"}\n", + "result = final_chain.invoke(inputs)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "977399fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Note that the memory does not save automatically\n", + "# This will be improved in the future\n", + "# For now you need to save it yourself\n", + "memory.save_context(inputs, {\"answer\": result[\"answer\"].content})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f94f7de4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'history': [HumanMessage(content='where did harrison work?', additional_kwargs={}, example=False),\n", + " AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)]}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memory.load_memory_variables({})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "poetry-venv", + "language": "python", + "name": "poetry-venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/sql_db.ipynb b/docs/extras/expression_language/cookbook/sql_db.ipynb new file mode 100644 index 0000000000..0cf0748009 --- /dev/null +++ b/docs/extras/expression_language/cookbook/sql_db.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "c14da114-1a4a-487d-9cff-e0e8c30ba366", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 3\n", + "title: Querying a SQL DB\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "506e9636", + "metadata": {}, + "source": [ + "We can replicate our SQLDatabaseChain with Runnables." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7a927516", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import ChatPromptTemplate\n", + "\n", + "template = \"\"\"Based on the table schema below, write a SQL query that would answer the user's question:\n", + "{schema}\n", + "\n", + "Question: {question}\n", + "SQL Query:\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3f51f386", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.utilities import SQLDatabase" + ] + }, + { + "cell_type": "markdown", + "id": "7c3449d6-684b-416e-ba16-90a035835a88", + "metadata": {}, + "source": [ + "We'll need the Chinook sample DB for this example. There's many places to download it from, e.g. 
https://database.guide/2-sample-databases-sqlite/" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2ccca6fc", + "metadata": {}, + "outputs": [], + "source": [ + "db = SQLDatabase.from_uri(\"sqlite:///./Chinook.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "05ba88ee", + "metadata": {}, + "outputs": [], + "source": [ + "def get_schema(_):\n", + " return db.get_table_info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a4eda902", + "metadata": {}, + "outputs": [], + "source": [ + "def run_query(query):\n", + " return db.run(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5046cb17", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.schema.runnable import RunnableLambda, RunnableMap\n", + "\n", + "model = ChatOpenAI()\n", + "\n", + "inputs = {\n", + " \"schema\": RunnableLambda(get_schema),\n", + " \"question\": itemgetter(\"question\")\n", + "}\n", + "sql_response = (\n", + " RunnableMap(inputs)\n", + " | prompt\n", + " | model.bind(stop=[\"\\nSQLResult:\"])\n", + " | StrOutputParser()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a5552039", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'SELECT COUNT(*) FROM Employee'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql_response.invoke({\"question\": \"How many employees are there?\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d6fee130", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Based on the table schema below, question, sql query, and sql response, write a natural language response:\n", + "{schema}\n", + "\n", + "Question: {question}\n", + "SQL Query: {query}\n", + "SQL Response: {response}\"\"\"\n", + "prompt_response = ChatPromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "923aa634", + "metadata": {}, + "outputs": [], + "source": [ + "full_chain = (\n", + " RunnableMap({\n", + " \"question\": itemgetter(\"question\"),\n", + " \"query\": sql_response,\n", + " }) \n", + " | {\n", + " \"schema\": RunnableLambda(get_schema),\n", + " \"question\": itemgetter(\"question\"),\n", + " \"query\": itemgetter(\"query\"),\n", + " \"response\": lambda x: db.run(x[\"query\"]) \n", + " } \n", + " | prompt_response \n", + " | model\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e94963d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='There are 8 employees.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "full_chain.invoke({\"question\": \"How many employees are there?\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f358d7b-a721-4db3-9f92-f06913428afc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/tools.ipynb b/docs/extras/expression_language/cookbook/tools.ipynb new file mode 100644 index 0000000000..d13dece3c9 --- /dev/null +++ b/docs/extras/expression_language/cookbook/tools.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "29781123", + "metadata": {}, + "source": [ + "# Using tools\n", + "\n", + "You can use any Tools with Runnables easily." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5c579dd-2e22-41b0-a789-346dfdecb5a2", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install duckduckgo-search" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9232d2a9", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.tools import DuckDuckGoSearchRun" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a0c64d2c", + "metadata": {}, + "outputs": [], + "source": [ + "search = DuckDuckGoSearchRun()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "391969b6", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"turn the following user input into a search query for a search engine:\n", + "\n", + "{input}\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "model = ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e3d9d20d", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model | StrOutputParser() | search" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "55f2967d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'What sports games are on TV today & tonight? Watch and stream live sports on TV today, tonight, tomorrow. Today\\'s 2023 sports TV schedule includes football, basketball, baseball, hockey, motorsports, soccer and more. Watch on TV or stream online on ESPN, FOX, FS1, CBS, NBC, ABC, Peacock, Paramount+, fuboTV, local channels and many other networks. MLB Games Tonight: How to Watch on TV, Streaming & Odds - Thursday, September 7. Seattle Mariners\\' Julio Rodriguez greets teammates in the dugout after scoring against the Oakland Athletics in a ... Circle - Country Music and Lifestyle. Live coverage of all the MLB action today is available to you, with the information provided below. The Brewers will look to pick up a road win at PNC Park against the Pirates on Wednesday at 12:35 PM ET. Check out the latest odds and with BetMGM Sportsbook. Use bonus code \"GNPLAY\" for special offers! MLB Games Tonight: How to Watch on TV, Streaming & Odds - Tuesday, September 5. Houston Astros\\' Kyle Tucker runs after hitting a double during the fourth inning of a baseball game against the Los Angeles Angels, Sunday, Aug. 13, 2023, in Houston. (AP Photo/Eric Christian Smith) (APMedia) The Houston Astros versus the Texas Rangers is one of ... The second half of tonight\\'s college football schedule still has some good games remaining to watch on your television.. We\\'ve already seen an exciting one when Colorado upset TCU. 
And we saw some ...'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"I'd like to figure out what games are tonight\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a16949cf-00ea-43c6-a6aa-797ad4f6918d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "poetry-venv", + "language": "python", + "name": "poetry-venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/how_to/_category_.yml b/docs/extras/expression_language/how_to/_category_.yml new file mode 100644 index 0000000000..39fa22bfbf --- /dev/null +++ b/docs/extras/expression_language/how_to/_category_.yml @@ -0,0 +1,2 @@ +label: 'How to' +position: 1 \ No newline at end of file diff --git a/docs/extras/expression_language/how_to/functions.ipynb b/docs/extras/expression_language/how_to/functions.ipynb new file mode 100644 index 0000000000..fc2f0a2962 --- /dev/null +++ b/docs/extras/expression_language/how_to/functions.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fbc4bf6e", + "metadata": {}, + "source": [ + "# Run arbitrary functions\n", + "\n", + "You can use arbitrary functions in the pipeline\n", + "\n", + "Note that all inputs to these functions need to be a SINGLE argument. If you have a function that accepts multiple arguments, you should write a wrapper that accepts a single input and unpacks it into multiple argument." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "6bb221b3", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.schema.runnable import RunnableLambda\n", + "\n", + "def length_function(text):\n", + " return len(text)\n", + "\n", + "def _multiple_length_function(text1, text2):\n", + " return len(text1) * len(text2)\n", + "\n", + "def multiple_length_function(_dict):\n", + " return _multiple_length_function(_dict[\"text1\"], _dict[\"text2\"])\n", + "\n", + "model = ChatOpenAI()\n", + "\n", + "prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n", + "\n", + "chain1 = prompt | model\n", + "\n", + "chain = {\n", + " \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n", + " \"b\": {\"text1\": itemgetter(\"foo\"), \"text2\": itemgetter(\"bar\")} | RunnableLambda(multiple_length_function)\n", + "} | prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "5488ec85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='3 + 9 equals 12.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bar\", \"bar\": \"gah\"})" + ] + }, + { + "cell_type": "markdown", + "id": "4728ddd9-914d-42ce-ae9b-72c9ce8ec940", + "metadata": {}, + "source": [ + "## Accepting a Runnable Config\n", + "\n", + "Runnable lambdas can optionally accept a [RunnableConfig](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.config.RunnableConfig.html?highlight=runnableconfig#langchain.schema.runnable.config.RunnableConfig), which they can use to pass callbacks, tags, and other configuration information to nested runs." + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "80b3b5f6-5d58-44b9-807e-cce9a46bf49f", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "ff0daf0c-49dd-4d21-9772-e5fa133c5f36", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "def parse_or_fix(text: str, config: RunnableConfig):\n", + " fixing_chain = (\n", + " ChatPromptTemplate.from_template(\n", + " \"Fix the following text:\\n\\n```text\\n{input}\\n```\\nError: {error}\"\n", + " \" Don't narrate, just respond with the fixed data.\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + " )\n", + " for _ in range(3):\n", + " try:\n", + " return json.loads(text)\n", + " except Exception as e:\n", + " text = fixing_chain.invoke({\"input\": text, \"error\": e}, config)\n", + " return \"Failed to parse\"" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "1a5e709e-9d75-48c7-bb9c-503251990505", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tokens Used: 65\n", + "\tPrompt Tokens: 56\n", + "\tCompletion Tokens: 9\n", + "Successful Requests: 1\n", + "Total Cost (USD): $0.00010200000000000001\n" + ] + } + ], + "source": [ + "from langchain.callbacks import get_openai_callback\n", + "\n", + "with get_openai_callback() as cb:\n", + " RunnableLambda(parse_or_fix).invoke(\"{foo: bar}\", {\"tags\": [\"my-tag\"], \"callbacks\": [cb]})\n", + " print(cb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + },
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/interface.ipynb b/docs/extras/expression_language/interface.ipynb index cf19bfe4db..c47800ecad 100644 --- a/docs/extras/expression_language/interface.ipynb +++ b/docs/extras/expression_language/interface.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "366a0e68-fd67-4fe5-a292-5c33733339ea", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 0\n", + "title: Interface\n", + "---" + ] + }, { "cell_type": "markdown", "id": "9a9acd2e", "metadata": {}, "source": [ - "# Interface\n", - "\n", "In an effort to make it as easy as possible to create custom chains, we've implemented a [\"Runnable\"](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.Runnable.html#langchain.schema.runnable.Runnable) protocol that most components implement. This is a standard interface with a few different methods, which makes it easy to define custom chains as well as making it possible to invoke them in a standard way. The standard interface exposed includes:\n", "\n", "- `stream`: stream back chunks of the response\n", @@ -429,7 +438,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, From b1d40b8626fba2fbd75699c8dd321058952f5354 Mon Sep 17 00:00:00 2001 From: kcocco <1429579+kcocco@users.noreply.github.com> Date: Thu, 7 Sep 2023 15:57:27 -0600 Subject: [PATCH 07/13] =?UTF-8?q?Fix=20colab=20link(missing=20graph=20in?= =?UTF-8?q?=20url)=20and=20comment=20to=20match=20the=20code=20fo=E2=80=A6?= =?UTF-8?q?=20(#10344)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Description: Fixing Colab broken link and comment correction to align with the code that uses Warren Buffet for wiki query - Issue: None open - Dependencies: none - Tag maintainer: n/a - Twitter handle: Not a PR change but: kcocco --- .../use_cases/more/graph/diffbot_graphtransformer.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb b/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb index da1c2fc020..f8961174ce 100644 --- a/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb +++ b/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb @@ -7,7 +7,7 @@ "source": [ "# Diffbot Graph Transformer\n", "\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/more/graph/diffbot_transformer.ipynb)\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb)\n", "\n", "## Use case\n", "\n", @@ -77,7 +77,7 @@ "id": "5e3b894a-e3ee-46c7-8116-f8377f8f0159", "metadata": {}, "source": [ - "This code fetches Wikipedia articles about \"Baldur's Gate 3\" and then uses `DiffbotGraphTransformer` to extract entities and relationships.\n", + "This code fetches Wikipedia articles about \"Warren Buffett\" and then uses `DiffbotGraphTransformer` to extract entities and relationships.\n", "The `DiffbotGraphTransformer` 
outputs a structured data `GraphDocument`, which can be used to populate a graph database.\n", "Note that text chunking is avoided due to Diffbot's [character limit per API request](https://docs.diffbot.com/reference/introduction-to-natural-language-api)." ] From 20c742d8a254466f080268b3f15dd9d99204935a Mon Sep 17 00:00:00 2001 From: Tze Min <40569118+tmin97@users.noreply.github.com> Date: Fri, 8 Sep 2023 05:58:28 +0800 Subject: [PATCH 08/13] Enhancement: add parameter boto3_session for AWS DynamoDB cross account use cases (#10326) - Description: to allow boto3 assume role for AWS cross account use cases to read and update the chat history, - Issue: use case I faced in my company, - Dependencies: no - Tag maintainer: @baskaryan , - Twitter handle: @tmin97 --------- Co-authored-by: Bagatur --- .../memory/chat_message_histories/dynamodb.py | 46 +++++++++++++++---- 1 file changed, 37 insertions(+), 9 deletions(-) diff --git a/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py b/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py index 704efa9ea7..06d7897dbd 100644 --- a/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py +++ b/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import logging -from typing import Dict, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional from langchain.schema import ( BaseChatMessageHistory, @@ -11,6 +13,9 @@ from langchain.schema.messages import ( messages_to_dict, ) +if TYPE_CHECKING: + from boto3.session import Session + logger = logging.getLogger(__name__) @@ -42,13 +47,21 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): endpoint_url: Optional[str] = None, primary_key_name: str = "SessionId", key: Optional[Dict[str, str]] = None, + boto3_session: Optional[Session] = None, ): - import boto3 - - if endpoint_url: - client = boto3.resource("dynamodb", endpoint_url=endpoint_url) + if boto3_session: + client = boto3_session.resource("dynamodb") else: - client = boto3.resource("dynamodb") + try: + import boto3 + except ImportError as e: + raise ImportError( + "Unable to import boto3, please install with `pip install boto3`." + ) from e + if endpoint_url: + client = boto3.resource("dynamodb", endpoint_url=endpoint_url) + else: + client = boto3.resource("dynamodb") self.table = client.Table(table_name) self.session_id = session_id self.key: Dict = key or {primary_key_name: session_id} @@ -56,7 +69,12 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): @property def messages(self) -> List[BaseMessage]: # type: ignore """Retrieve the messages from DynamoDB""" - from botocore.exceptions import ClientError + try: + from botocore.exceptions import ClientError + except ImportError as e: + raise ImportError( + "Unable to import botocore, please install with `pip install botocore`." + ) from e response = None try: @@ -77,7 +95,12 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): def add_message(self, message: BaseMessage) -> None: """Append the message to the record in DynamoDB""" - from botocore.exceptions import ClientError + try: + from botocore.exceptions import ClientError + except ImportError as e: + raise ImportError( + "Unable to import botocore, please install with `pip install botocore`." 
+ ) from e messages = messages_to_dict(self.messages) _message = _message_to_dict(message) @@ -90,7 +113,12 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): def clear(self) -> None: """Clear session memory from DynamoDB""" - from botocore.exceptions import ClientError + try: + from botocore.exceptions import ClientError + except ImportError as e: + raise ImportError( + "Unable to import botocore, please install with `pip install botocore`." + ) from e try: self.table.delete_item(self.key) From 300559695b4d01df952e629753506d6ade49b1b3 Mon Sep 17 00:00:00 2001 From: Greg Richardson Date: Thu, 7 Sep 2023 16:03:26 -0600 Subject: [PATCH 09/13] Supabase vector self querying retriever (#10304) ## Description Adds Supabase Vector as a self-querying retriever. - Designed to be backwards compatible with existing `filter` logic on `SupabaseVectorStore`. - Adds new filter `postgrest_filter` to `SupabaseVectorStore` `similarity_search()` methods - Supports entire PostgREST [filter query language](https://postgrest.org/en/stable/references/api/tables_views.html#read) (used by self-querying retriever, but also works as an escape hatch for more query control) - `SupabaseVectorTranslator` converts Langchain filter into the above PostgREST query - Adds Jupyter Notebook for the self-querying retriever - Adds tests ## Tag maintainer @hwchase17 ## Twitter handle [@ggrdson](https://twitter.com/ggrdson) --- .../integrations/vectorstores/supabase.ipynb | 68 +- .../self_query/supabase_self_query.ipynb | 587 ++++++++++++++++++ .../langchain/retrievers/self_query/base.py | 3 + .../retrievers/self_query/supabase.py | 97 +++ .../langchain/vectorstores/supabase.py | 42 +- .../retrievers/self_query/test_supabase.py | 85 +++ 6 files changed, 839 insertions(+), 43 deletions(-) create mode 100644 docs/extras/modules/data_connection/retrievers/self_query/supabase_self_query.ipynb create mode 100644 libs/langchain/langchain/retrievers/self_query/supabase.py create mode 100644 libs/langchain/tests/unit_tests/retrievers/self_query/test_supabase.py diff --git a/docs/extras/integrations/vectorstores/supabase.ipynb b/docs/extras/integrations/vectorstores/supabase.ipynb index 9ba5dacd08..9a5f583adb 100644 --- a/docs/extras/integrations/vectorstores/supabase.ipynb +++ b/docs/extras/integrations/vectorstores/supabase.ipynb @@ -28,43 +28,41 @@ "The following function determines cosine similarity, but you can adjust to your needs.\n", "\n", "```sql\n", - " -- Enable the pgvector extension to work with embedding vectors\n", - " create extension vector;\n", + "-- Enable the pgvector extension to work with embedding vectors\n", + "create extension if not exists vector;\n", "\n", - " -- Create a table to store your documents\n", - " create table documents (\n", - " id uuid primary key,\n", - " content text, -- corresponds to Document.pageContent\n", - " metadata jsonb, -- corresponds to Document.metadata\n", - " embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed\n", - " );\n", + "-- Create a table to store your documents\n", + "create table\n", + " documents (\n", + " id uuid primary key,\n", + " content text, -- corresponds to Document.pageContent\n", + " metadata jsonb, -- corresponds to Document.metadata\n", + " embedding vector (1536) -- 1536 works for OpenAI embeddings, change if needed\n", + " );\n", "\n", - " CREATE FUNCTION match_documents(query_embedding vector(1536), match_count int)\n", - " RETURNS TABLE(\n", - " id uuid,\n", - " content text,\n", - " metadata jsonb,\n", - " -- we return 
matched vectors to enable maximal marginal relevance searches\n", - " embedding vector(1536),\n", - " similarity float)\n", - " LANGUAGE plpgsql\n", - " AS $$\n", - " # variable_conflict use_column\n", - " BEGIN\n", - " RETURN query\n", - " SELECT\n", - " id,\n", - " content,\n", - " metadata,\n", - " embedding,\n", - " 1 -(documents.embedding <=> query_embedding) AS similarity\n", - " FROM\n", - " documents\n", - " ORDER BY\n", - " documents.embedding <=> query_embedding\n", - " LIMIT match_count;\n", - " END;\n", - " $$;\n", + "-- Create a function to search for documents\n", + "create function match_documents (\n", + " query_embedding vector (1536),\n", + " filter jsonb default '{}'\n", + ") returns table (\n", + " id uuid,\n", + " content text,\n", + " metadata jsonb,\n", + " similarity float\n", + ") language plpgsql as $$\n", + "#variable_conflict use_column\n", + "begin\n", + " return query\n", + " select\n", + " id,\n", + " content,\n", + " metadata,\n", + " 1 - (documents.embedding <=> query_embedding) as similarity\n", + " from documents\n", + " where metadata @> filter\n", + " order by documents.embedding <=> query_embedding;\n", + "end;\n", + "$$;\n", "```" ] }, diff --git a/docs/extras/modules/data_connection/retrievers/self_query/supabase_self_query.ipynb b/docs/extras/modules/data_connection/retrievers/self_query/supabase_self_query.ipynb new file mode 100644 index 0000000000..1414f70d38 --- /dev/null +++ b/docs/extras/modules/data_connection/retrievers/self_query/supabase_self_query.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "13afcae7", + "metadata": {}, + "source": [ + "# Supabase Vector self-querying \n", + "\n", + ">[Supabase](https://supabase.com/docs) is an open source `Firebase` alternative. \n", + "> `Supabase` is built on top of `PostgreSQL`, which offers strong `SQL` \n", + "> querying capabilities and enables a simple interface with already-existing tools and frameworks.\n", + "\n", + ">[PostgreSQL](https://en.wikipedia.org/wiki/PostgreSQL) also known as `Postgres`,\n", + "> is a free and open-source relational database management system (RDBMS) \n", + "> emphasizing extensibility and `SQL` compliance.\n", + "\n", + "In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Supabase vector store.\n", + "\n", + "Specifically we will:\n", + "1. Create a Supabase database\n", + "2. Enable the `pgvector` extension\n", + "3. Create a `documents` table and `match_documents` function that will be used by `SupabaseVectorStore`\n", + "4. Load sample documents into the vector store (database table)\n", + "5. Build and test a self-querying retriever" + ] + }, + { + "cell_type": "markdown", + "id": "347935ad", + "metadata": {}, + "source": [ + "## Setup Supabase Database\n", + "\n", + "1. Head over to https://database.new to provision your Supabase database.\n", + "2. 
In the studio, jump to the [SQL editor](https://supabase.com/dashboard/project/_/sql/new) and run the following script to enable `pgvector` and setup your database as a vector store:\n", + " ```sql\n", + " -- Enable the pgvector extension to work with embedding vectors\n", + " create extension if not exists vector;\n", + "\n", + " -- Create a table to store your documents\n", + " create table\n", + " documents (\n", + " id uuid primary key,\n", + " content text, -- corresponds to Document.pageContent\n", + " metadata jsonb, -- corresponds to Document.metadata\n", + " embedding vector (1536) -- 1536 works for OpenAI embeddings, change if needed\n", + " );\n", + "\n", + " -- Create a function to search for documents\n", + " create function match_documents (\n", + " query_embedding vector (1536),\n", + " filter jsonb default '{}'\n", + " ) returns table (\n", + " id uuid,\n", + " content text,\n", + " metadata jsonb,\n", + " similarity float\n", + " ) language plpgsql as $$\n", + " #variable_conflict use_column\n", + " begin\n", + " return query\n", + " select\n", + " id,\n", + " content,\n", + " metadata,\n", + " 1 - (documents.embedding <=> query_embedding) as similarity\n", + " from documents\n", + " where metadata @> filter\n", + " order by documents.embedding <=> query_embedding;\n", + " end;\n", + " $$;\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "id": "68e75fb9", + "metadata": {}, + "source": [ + "## Creating a Supabase vector store\n", + "Next we'll want to create a Supabase vector store and seed it with some data. We've created a small demo set of documents that contain summaries of movies.\n", + "\n", + "Be sure to install the latest version of `langchain`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78546fd7", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install langchain" + ] + }, + { + "cell_type": "markdown", + "id": "e06df198", + "metadata": {}, + "source": [ + "The self-query retriever requires you to have `lark` installed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a8af5b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install lark" + ] + }, + { + "cell_type": "markdown", + "id": "114f768f", + "metadata": {}, + "source": [ + "We also need the `openai` and `supabase` packages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "434ae558", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22431060-52c4-48a7-a97b-9f542b8b0928", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install supabase==1.0.0" + ] + }, + { + "cell_type": "markdown", + "id": "83811610-7df3-4ede-b268-68a6a83ba9e2", + "metadata": {}, + "source": [ + "Since we are using `SupabaseVectorStore` and `OpenAIEmbeddings`, we have to load their API keys.\n", + "\n", + "- To find your `SUPABASE_URL` and `SUPABASE_SERVICE_KEY`, head to your Supabase project's [API settings](https://supabase.com/dashboard/project/_/settings/api).\n", + " - `SUPABASE_URL` corresponds to the Project URL\n", + " - `SUPABASE_SERVICE_KEY` corresponds to the `service_role` API key\n", + "\n", + "- To get your `OPENAI_API_KEY`, navigate to [API keys](https://platform.openai.com/account/api-keys) on your OpenAI account and create a new secret key." 
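Before wiring this into LangChain, you can sanity-check the table and function directly with the `supabase` client. This is a rough sketch under the assumption that the SQL above has been run; the URL/key values are placeholders and the all-zeros vector is a dummy embedding, not a real one:

```python
# Rough sanity check of the SQL setup above (not part of this notebook).
# <project> and <service-role-key> are placeholders; the zero vector only
# confirms the match_documents RPC is callable end to end.
from supabase.client import create_client

supabase = create_client("https://<project>.supabase.co", "<service-role-key>")
response = supabase.rpc(
    "match_documents",
    {"query_embedding": [0.0] * 1536, "filter": {}},
).execute()
print(response.data)  # expect an empty list until documents are inserted
```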
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dd01b61b-7d32-4a55-85d6-b2d2d4f18840", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"SUPABASE_URL\"] = getpass.getpass(\"Supabase URL:\")\n", + "os.environ[\"SUPABASE_SERVICE_KEY\"] = getpass.getpass(\"Supabase Service Key:\")\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "markdown", + "id": "3aaf5075", + "metadata": {}, + "source": [ + "_Optional:_ If you're storing your Supabase and OpenAI API keys in a `.env` file, you can load them with [`dotenv`](https://github.com/theskumar/python-dotenv)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0089221", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install python-dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d56c5ef", + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "id": "f6dd9aef", + "metadata": {}, + "source": [ + "First we'll create a Supabase client and instantiate an OpenAI embeddings class." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cb4a5787", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from supabase.client import Client, create_client\n", + "from langchain.schema import Document\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import SupabaseVectorStore\n", + "\n", + "supabase_url = os.environ.get(\"SUPABASE_URL\")\n", + "supabase_key = os.environ.get(\"SUPABASE_SERVICE_KEY\")\n", + "supabase: Client = create_client(supabase_url, supabase_key)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "markdown", + "id": "0fca9b0b", + "metadata": {}, + "source": [ + "Next let's create our documents."
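Before that, one quick check worth doing at this point: the embedding size must match the `vector(1536)` column declared in the SQL setup. A minimal sketch, assuming the default `OpenAIEmbeddings` model (which returns 1536-dimensional vectors):

```python
# Minimal check (assumption: the default OpenAIEmbeddings model emits
# 1536-dimensional vectors, matching the vector(1536) column above).
vector = embeddings.embed_query("dimension check")
assert len(vector) == 1536, f"expected 1536 dimensions, got {len(vector)}"
```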
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bcbe04d9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = [\n", + " Document(\n", + " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", + " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", + " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n", + " ),\n", + " Document(\n", + " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", + " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n", + " ),\n", + " Document(\n", + " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", + " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n", + " ),\n", + " Document(\n", + " page_content=\"Toys come alive and have a blast doing so\",\n", + " metadata={\"year\": 1995, \"genre\": \"animated\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", + " metadata={\n", + " \"year\": 1979,\n", + " \"rating\": 9.9,\n", + " \"director\": \"Andrei Tarkovsky\",\n", + " \"genre\": \"science fiction\",\n", + " },\n", + " ),\n", + "]\n", + "\n", + "vectorstore = SupabaseVectorStore.from_documents(docs, embeddings, client=supabase, table_name=\"documents\", query_name=\"match_documents\")" + ] + }, + { + "cell_type": "markdown", + "id": "5ecaab6d", + "metadata": {}, + "source": [ + "## Creating our self-querying retriever\n", + "Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86e34dbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"director\",\n", + " description=\"The name of the movie director\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " ),\n", + "]\n", + "document_content_description = \"Brief summary of a movie\"\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ea9df8d4", + "metadata": {}, + "source": [ + "## Testing it out\n", + "And now we can try actually using our retriever!"
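For context on what happens under the hood: the structured query produced by the LLM is converted into a PostgREST filter string by the `SupabaseVectorTranslator` added later in this PR. A minimal illustration of that translation step, using only classes introduced in this patch:

```python
# Illustration of the filter translation the self-query retriever performs
# internally (uses the SupabaseVectorTranslator added in this PR).
from langchain.chains.query_constructor.ir import Comparator, Comparison
from langchain.retrievers.self_query.supabase import SupabaseVectorTranslator

translator = SupabaseVectorTranslator()
comparison = Comparison(comparator=Comparator.GT, attribute="rating", value=8.5)
print(translator.visit_comparison(comparison))  # metadata->rating.gt.8.5
```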
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38a126e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'genre': 'science fiction', 'rating': 7.7}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fc3f1e6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a filter\n", + "retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "b19d4da0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='women' filter=Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='director', value='Greta Gerwig') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'year': 2019, 'rating': 8.3, 'director': 'Greta Gerwig'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a filter\n", + "retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f900e40e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=<Operator.AND: 'and'>, arguments=[Comparison(comparator=<Comparator.GT: 'gt'>, attribute='rating', value=8.5), Comparison(comparator=<Comparator.EQ: 'eq'>, attribute='genre', value='science fiction')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'})]" + ] + },
"execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a highly rated (above 8.5) science fiction film?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "12a51522", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='toys' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='year', value=1990), Comparison(comparator=, attribute='year', value=2005), Comparison(comparator=, attribute='genre', value='animated')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a movie after 1990 but before (or on) 2005 that's all about toys, and preferably is animated\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51", + "metadata": {}, + "source": [ + "## Filter k\n", + "\n", + "We can also use the self query retriever to specify `k`: the number of documents to fetch.\n", + "\n", + "We can do this by passing `enable_limit=True` to the constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bff36b88-b506-4877-9c63-e5a1a8d78e64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "retriever = SelfQueryRetriever.from_llm(\n", + " llm,\n", + " vectorstore,\n", + " document_content_description,\n", + " metadata_field_info,\n", + " enable_limit=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2758d229-4f97-499c-819f-888acaf8ee10", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'genre': 'science fiction', 'rating': 7.7}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"what are two movies about dinosaurs\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/libs/langchain/langchain/retrievers/self_query/base.py b/libs/langchain/langchain/retrievers/self_query/base.py index a34c756bd4..62bce34847 100644 --- a/libs/langchain/langchain/retrievers/self_query/base.py +++ b/libs/langchain/langchain/retrievers/self_query/base.py @@ -16,6 +16,7 @@ from langchain.retrievers.self_query.milvus import MilvusTranslator from 
langchain.retrievers.self_query.myscale import MyScaleTranslator from langchain.retrievers.self_query.pinecone import PineconeTranslator from langchain.retrievers.self_query.qdrant import QdrantTranslator +from langchain.retrievers.self_query.supabase import SupabaseVectorTranslator from langchain.retrievers.self_query.vectara import VectaraTranslator from langchain.retrievers.self_query.weaviate import WeaviateTranslator from langchain.schema import BaseRetriever, Document @@ -29,6 +30,7 @@ from langchain.vectorstores import ( MyScale, Pinecone, Qdrant, + SupabaseVectorStore, Vectara, VectorStore, Weaviate, @@ -49,6 +51,7 @@ def _get_builtin_translator(vectorstore: VectorStore) -> Visitor: DeepLake: DeepLakeTranslator, ElasticsearchStore: ElasticsearchTranslator, Milvus: MilvusTranslator, + SupabaseVectorStore: SupabaseVectorTranslator, } if vectorstore_cls not in BUILTIN_TRANSLATORS: raise ValueError( diff --git a/libs/langchain/langchain/retrievers/self_query/supabase.py b/libs/langchain/langchain/retrievers/self_query/supabase.py new file mode 100644 index 0000000000..267e228fcd --- /dev/null +++ b/libs/langchain/langchain/retrievers/self_query/supabase.py @@ -0,0 +1,97 @@ +from typing import Any, Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, + Visitor, +) + + +class SupabaseVectorTranslator(Visitor): + """Translate Langchain filters to Supabase PostgREST filters.""" + + allowed_operators = [Operator.AND, Operator.OR] + """Subset of allowed logical operators.""" + + allowed_comparators = [ + Comparator.EQ, + Comparator.NE, + Comparator.GT, + Comparator.GTE, + Comparator.LT, + Comparator.LTE, + Comparator.LIKE, + ] + """Subset of allowed logical comparators.""" + + metadata_column = "metadata" + + def _map_comparator(self, comparator: Comparator) -> str: + """ + Maps Langchain comparator to PostgREST comparator: + + https://postgrest.org/en/stable/references/api/tables_views.html#operators + """ + postgrest_comparator = { + Comparator.EQ: "eq", + Comparator.NE: "neq", + Comparator.GT: "gt", + Comparator.GTE: "gte", + Comparator.LT: "lt", + Comparator.LTE: "lte", + Comparator.LIKE: "like", + }.get(comparator) + + if postgrest_comparator is None: + raise Exception( + f"Comparator '{comparator}' is not currently " + "supported in Supabase Vector" + ) + + return postgrest_comparator + + def _get_json_operator(self, value: Any) -> str: + if isinstance(value, str): + return "->>" + else: + return "->" + + def visit_operation(self, operation: Operation) -> str: + args = [arg.accept(self) for arg in operation.arguments] + return f"{operation.operator.value}({','.join(args)})" + + def visit_comparison(self, comparison: Comparison) -> str: + if isinstance(comparison.value, list): + return self.visit_operation( + Operation( + operator=Operator.AND, + arguments=( + Comparison( + comparator=comparison.comparator, + attribute=comparison.attribute, + value=value, + ) + for value in comparison.value + ), + ) + ) + + return ".".join( + [ + f"{self.metadata_column}{self._get_json_operator(comparison.value)}{comparison.attribute}", + f"{self._map_comparator(comparison.comparator)}", + f"{comparison.value}", + ] + ) + + def visit_structured_query( + self, structured_query: StructuredQuery + ) -> Tuple[str, Dict[str, str]]: + if structured_query.filter is None: + kwargs = {} + else: + kwargs = {"postgrest_filter": structured_query.filter.accept(self)} + return structured_query.query, kwargs diff --git 
a/libs/langchain/langchain/vectorstores/supabase.py b/libs/langchain/langchain/vectorstores/supabase.py index 4214e085f6..71b7c2cc8b 100644 --- a/libs/langchain/langchain/vectorstores/supabase.py +++ b/libs/langchain/langchain/vectorstores/supabase.py @@ -199,18 +199,31 @@ class SupabaseVectorStore(VectorStore): ) def match_args( - self, query: List[float], k: int, filter: Optional[Dict[str, Any]] + self, query: List[float], filter: Optional[Dict[str, Any]] ) -> Dict[str, Any]: - ret = dict(query_embedding=query, match_count=k) + ret: Dict[str, Any] = dict(query_embedding=query) if filter: ret["filter"] = filter return ret def similarity_search_by_vector_with_relevance_scores( - self, query: List[float], k: int, filter: Optional[Dict[str, Any]] = None + self, + query: List[float], + k: int, + filter: Optional[Dict[str, Any]] = None, + postgrest_filter: Optional[str] = None, ) -> List[Tuple[Document, float]]: - match_documents_params = self.match_args(query, k, filter) - res = self._client.rpc(self.query_name, match_documents_params).execute() + match_documents_params = self.match_args(query, filter) + query_builder = self._client.rpc(self.query_name, match_documents_params) + + if postgrest_filter: + query_builder.params = query_builder.params.set( + "and", f"({postgrest_filter})" + ) + + query_builder.params = query_builder.params.set("limit", k) + + res = query_builder.execute() match_result = [ ( @@ -227,10 +240,23 @@ class SupabaseVectorStore(VectorStore): return match_result def similarity_search_by_vector_returning_embeddings( - self, query: List[float], k: int, filter: Optional[Dict[str, Any]] = None + self, + query: List[float], + k: int, + filter: Optional[Dict[str, Any]] = None, + postgrest_filter: Optional[str] = None, ) -> List[Tuple[Document, float, np.ndarray[np.float32, Any]]]: - match_documents_params = self.match_args(query, k, filter) - res = self._client.rpc(self.query_name, match_documents_params).execute() + match_documents_params = self.match_args(query, filter) + query_builder = self._client.rpc(self.query_name, match_documents_params) + + if postgrest_filter: + query_builder.params = query_builder.params.set( + "and", f"({postgrest_filter})" + ) + + query_builder.params = query_builder.params.set("limit", k) + + res = query_builder.execute() match_result = [ ( diff --git a/libs/langchain/tests/unit_tests/retrievers/self_query/test_supabase.py b/libs/langchain/tests/unit_tests/retrievers/self_query/test_supabase.py new file mode 100644 index 0000000000..de9b04fabf --- /dev/null +++ b/libs/langchain/tests/unit_tests/retrievers/self_query/test_supabase.py @@ -0,0 +1,85 @@ +from typing import Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, +) +from langchain.retrievers.self_query.supabase import SupabaseVectorTranslator + +DEFAULT_TRANSLATOR = SupabaseVectorTranslator() + + +def test_visit_comparison() -> None: + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=["1", "2"]) + expected = "and(metadata->>foo.lt.1,metadata->>foo.lt.2)" + actual = DEFAULT_TRANSLATOR.visit_comparison(comp) + assert expected == actual + + +def test_visit_operation() -> None: + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=["1", "2"]), + ], + ) + expected = ( + "and(" + 
"metadata->foo.lt.2," + "metadata->>bar.eq.baz," + "and(metadata->>abc.lt.1,metadata->>abc.lt.2)" + ")" + ) + actual = DEFAULT_TRANSLATOR.visit_operation(op) + assert expected == actual + + +def test_visit_structured_query() -> None: + query = "What is the capital of France?" + structured_query = StructuredQuery( + query=query, + filter=None, + ) + expected: Tuple[str, Dict] = (query, {}) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=["1", "2"]) + expected = ( + query, + {"postgrest_filter": "and(metadata->>foo.lt.1,metadata->>foo.lt.2)"}, + ) + structured_query = StructuredQuery( + query=query, + filter=comp, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=["1", "2"]), + ], + ) + structured_query = StructuredQuery( + query=query, + filter=op, + ) + expected = ( + query, + { + "postgrest_filter": ( + "and(metadata->foo.lt.2,metadata->>bar.eq.baz,and(metadata->>abc.lt.1,metadata->>abc.lt.2))" + ) + }, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual From 1b3ea1eeb4c8e9e50257b02fda1654027143234d Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Thu, 7 Sep 2023 19:35:34 -0700 Subject: [PATCH 10/13] docstrings: `chat_loaders` (#10307) Updated docstrings. Made them consistent across the module. --- .../langchain/chat_loaders/__init__.py | 21 ++++++++++++++---- libs/langchain/langchain/chat_loaders/base.py | 9 +------- .../chat_loaders/facebook_messenger.py | 4 ++-- .../langchain/langchain/chat_loaders/gmail.py | 2 +- .../langchain/chat_loaders/imessage.py | 22 +++++++++---------- .../langchain/langchain/chat_loaders/slack.py | 2 ++ .../langchain/chat_loaders/telegram.py | 3 +-- .../langchain/chat_loaders/whatsapp.py | 2 ++ 8 files changed, 37 insertions(+), 28 deletions(-) diff --git a/libs/langchain/langchain/chat_loaders/__init__.py b/libs/langchain/langchain/chat_loaders/__init__.py index 594d87344d..7547ddcecc 100644 --- a/libs/langchain/langchain/chat_loaders/__init__.py +++ b/libs/langchain/langchain/chat_loaders/__init__.py @@ -1,6 +1,19 @@ -"""Load chat messages from common communications platforms for finetuning. +"""**Chat Loaders** load chat messages from common communications platforms. -This module provides functions to load chat messages from various +Load chat messages from various communications platforms such as Facebook Messenger, Telegram, and -WhatsApp. The loaded chat messages can be used for finetuning models. -""" +WhatsApp. The loaded chat messages can be used for fine-tuning models. + +**Class hierarchy:** + +.. code-block:: + + BaseChatLoader --> ChatLoader # Examples: WhatsAppChatLoader, IMessageChatLoader + +**Main helpers:** + +.. code-block:: + + ChatSession + +""" # noqa: E501 diff --git a/libs/langchain/langchain/chat_loaders/base.py b/libs/langchain/langchain/chat_loaders/base.py index 418ba15d2f..6e1f37ca9a 100644 --- a/libs/langchain/langchain/chat_loaders/base.py +++ b/libs/langchain/langchain/chat_loaders/base.py @@ -1,10 +1,3 @@ -"""Base definitions for chat loaders. - -A chat loader is a class that loads chat messages from an external -source such as a file or a database. 
The chat messages can then be -used for finetuning. -""" - from abc import ABC, abstractmethod from typing import Iterator, List, Sequence, TypedDict @@ -12,7 +5,7 @@ from langchain.schema.messages import BaseMessage class ChatSession(TypedDict): - """A chat session represents a single + """Chat Session represents a single conversation, channel, or other group of messages.""" messages: Sequence[BaseMessage] diff --git a/libs/langchain/langchain/chat_loaders/facebook_messenger.py b/libs/langchain/langchain/chat_loaders/facebook_messenger.py index 5864c32740..bfdc0155c7 100644 --- a/libs/langchain/langchain/chat_loaders/facebook_messenger.py +++ b/libs/langchain/langchain/chat_loaders/facebook_messenger.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__file__) class SingleFileFacebookMessengerChatLoader(BaseChatLoader): - """A chat loader for loading Facebook Messenger chat data from a single file. + """Load `Facebook Messenger` chat data from a single file. Args: path (Union[Path, str]): The path to the chat file. @@ -45,7 +45,7 @@ class SingleFileFacebookMessengerChatLoader(BaseChatLoader): class FolderFacebookMessengerChatLoader(BaseChatLoader): - """A chat loader for loading Facebook Messenger chat data from a folder. + """Load `Facebook Messenger` chat data from a folder. Args: path (Union[str, Path]): The path to the directory diff --git a/libs/langchain/langchain/chat_loaders/gmail.py b/libs/langchain/langchain/chat_loaders/gmail.py index 4e88accdee..94a3c5617e 100644 --- a/libs/langchain/langchain/chat_loaders/gmail.py +++ b/libs/langchain/langchain/chat_loaders/gmail.py @@ -62,7 +62,7 @@ def _get_message_data(service: Any, message: Any) -> ChatSession: class GMailLoader(BaseChatLoader): - """This loader goes over how to load data from GMail. + """Load data from `GMail`. There are many ways you could want to load data from GMail. This loader is currently fairly opinionated in how to do so. diff --git a/libs/langchain/langchain/chat_loaders/imessage.py b/libs/langchain/langchain/chat_loaders/imessage.py index ff9a06142c..d6c02f1e53 100644 --- a/libs/langchain/langchain/chat_loaders/imessage.py +++ b/libs/langchain/langchain/chat_loaders/imessage.py @@ -1,14 +1,3 @@ -"""IMessage Chat Loader. - -This class is used to load chat sessions from the iMessage chat.db SQLite file. -It only works on macOS when you have iMessage enabled and have the chat.db file. - -The chat.db file is likely located at ~/Library/Messages/chat.db. However, your -terminal may not have permission to access this file. To resolve this, you can -copy the file to a different location, change the permissions of the file, or -grant full disk access for your terminal emulator in System Settings > Security -and Privacy > Full Disk Access. -""" from __future__ import annotations from pathlib import Path @@ -22,6 +11,17 @@ if TYPE_CHECKING: class IMessageChatLoader(chat_loaders.BaseChatLoader): + """Load chat sessions from the `iMessage` chat.db SQLite file. + + It only works on macOS when you have iMessage enabled and have the chat.db file. + + The chat.db file is likely located at ~/Library/Messages/chat.db. However, your + terminal may not have permission to access this file. To resolve this, you can + copy the file to a different location, change the permissions of the file, or + grant full disk access for your terminal emulator in System Settings > Security + and Privacy > Full Disk Access. + """ + def __init__(self, path: Optional[Union[str, Path]] = None): """ Initialize the IMessageChatLoader. 
diff --git a/libs/langchain/langchain/chat_loaders/slack.py b/libs/langchain/langchain/chat_loaders/slack.py index 261289bb43..0bbd503979 100644 --- a/libs/langchain/langchain/chat_loaders/slack.py +++ b/libs/langchain/langchain/chat_loaders/slack.py @@ -12,6 +12,8 @@ logger = logging.getLogger(__name__) class SlackChatLoader(chat_loaders.BaseChatLoader): + """Load `Slack` conversations from a dump zip file.""" + def __init__( self, path: Union[str, Path], diff --git a/libs/langchain/langchain/chat_loaders/telegram.py b/libs/langchain/langchain/chat_loaders/telegram.py index f55fd71476..5f0bbfa324 100644 --- a/libs/langchain/langchain/chat_loaders/telegram.py +++ b/libs/langchain/langchain/chat_loaders/telegram.py @@ -13,8 +13,7 @@ logger = logging.getLogger(__name__) class TelegramChatLoader(chat_loaders.BaseChatLoader): - """A loading utility for converting telegram conversations - to LangChain chat messages. + """Load `telegram` conversations to LangChain chat messages. To export, use the Telegram Desktop app from https://desktop.telegram.org/, select a conversation, click the three dots diff --git a/libs/langchain/langchain/chat_loaders/whatsapp.py b/libs/langchain/langchain/chat_loaders/whatsapp.py index c911e262c6..e2518ab44d 100644 --- a/libs/langchain/langchain/chat_loaders/whatsapp.py +++ b/libs/langchain/langchain/chat_loaders/whatsapp.py @@ -12,6 +12,8 @@ logger = logging.getLogger(__name__) class WhatsAppChatLoader(chat_loaders.BaseChatLoader): + """Load `WhatsApp` conversations from a dump zip file or directory.""" + def __init__(self, path: str): """Initialize the WhatsAppChatLoader. From fdba711d28375e86b23cfbad10a17feb67276ef5 Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Thu, 7 Sep 2023 19:53:33 -0700 Subject: [PATCH 11/13] docs `integrations/embeddings` consistency (#10302) Updated `integrations/embeddings`: fixed titles; added links, descriptions Updated `integrations/providers`. 
--- docs/docs_skeleton/vercel.json | 4 ++ docs/extras/integrations/providers/awadb.md | 15 +++-- .../integrations/providers/modelscope.mdx | 14 +++-- .../integrations/providers/nlpcloud.mdx | 26 +++++++-- docs/extras/integrations/providers/spacy.mdx | 8 +++ .../text_embedding/{Awa.ipynb => awadb.ipynb} | 8 ++- .../integrations/text_embedding/bedrock.ipynb | 6 +- .../text_embedding/bge_huggingface.ipynb | 33 ++++++++--- .../google_vertex_ai_palm.ipynb | 9 +-- .../text_embedding/modelscope_hub.ipynb | 20 +++++-- .../text_embedding/mosaicml.ipynb | 17 ++++-- .../text_embedding/nlp_cloud.ipynb | 6 +- .../text_embedding/sagemaker-endpoint.ipynb | 10 ++-- .../text_embedding/self-hosted.ipynb | 10 ++-- .../sentence_transformers.ipynb | 9 ++- .../text_embedding/spacy_embedding.ipynb | 55 +++++++++++++------ 16 files changed, 170 insertions(+), 80 deletions(-) rename docs/extras/integrations/text_embedding/{Awa.ipynb => awadb.ipynb} (89%) diff --git a/docs/docs_skeleton/vercel.json b/docs/docs_skeleton/vercel.json index 2f560db73a..47e08936b4 100644 --- a/docs/docs_skeleton/vercel.json +++ b/docs/docs_skeleton/vercel.json @@ -2216,6 +2216,10 @@ "source": "/docs/modules/data_connection/text_embedding/integrations/tensorflowhub", "destination": "/docs/integrations/text_embedding/tensorflowhub" }, + { + "source": "/docs/integrations/text_embedding/Awa", + "destination": "/docs/integrations/text_embedding/awadb" + }, { "source": "/en/latest/modules/indexes/vectorstores/examples/analyticdb.html", "destination": "/docs/integrations/vectorstores/analyticdb" diff --git a/docs/extras/integrations/providers/awadb.md b/docs/extras/integrations/providers/awadb.md index 7c2e9943f5..be6d4d66fe 100644 --- a/docs/extras/integrations/providers/awadb.md +++ b/docs/extras/integrations/providers/awadb.md @@ -9,13 +9,20 @@ pip install awadb ``` -## VectorStore +## Vector Store -There exists a wrapper around AwaDB vector databases, allowing you to use it as a vectorstore, -whether for semantic search or example selection. ```python from langchain.vectorstores import AwaDB ``` -For a more detailed walkthrough of the AwaDB wrapper, see [here](/docs/integrations/vectorstores/awadb.html). +See a [usage example](/docs/integrations/vectorstores/awadb). + + +## Text Embedding Model + +```python +from langchain.embeddings import AwaEmbeddings +``` + +See a [usage example](/docs/integrations/text_embedding/awadb). diff --git a/docs/extras/integrations/providers/modelscope.mdx b/docs/extras/integrations/providers/modelscope.mdx index c37c5f60c4..df6add2bb1 100644 --- a/docs/extras/integrations/providers/modelscope.mdx +++ b/docs/extras/integrations/providers/modelscope.mdx @@ -1,20 +1,24 @@ # ModelScope +>[ModelScope](https://www.modelscope.cn/home) is a large repository of models and datasets. + This page covers how to use the modelscope ecosystem within LangChain. It is broken into two parts: installation and setup, and then references to specific modelscope wrappers. ## Installation and Setup -* Install the Python SDK with `pip install modelscope` +Install the `modelscope` package.
+ +```bash +pip install modelscope +``` -## Wrappers -### Embeddings +## Text Embedding Models -There exists a modelscope Embeddings wrapper, which you can access with ```python from langchain.embeddings import ModelScopeEmbeddings ``` -For a more detailed walkthrough of this, see [this notebook](/docs/integrations/text_embedding/modelscope_hub.html) +For a more detailed walkthrough of this, see [this notebook](/docs/integrations/text_embedding/modelscope_hub) diff --git a/docs/extras/integrations/providers/nlpcloud.mdx b/docs/extras/integrations/providers/nlpcloud.mdx index 050da5af04..e401faeb5a 100644 --- a/docs/extras/integrations/providers/nlpcloud.mdx +++ b/docs/extras/integrations/providers/nlpcloud.mdx @@ -1,17 +1,31 @@ # NLPCloud -This page covers how to use the NLPCloud ecosystem within LangChain. -It is broken into two parts: installation and setup, and then references to specific NLPCloud wrappers. +>[NLP Cloud](https://docs.nlpcloud.com/#introduction) is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. + ## Installation and Setup -- Install the Python SDK with `pip install nlpcloud` + +- Install the `nlpcloud` package. + +```bash +pip install nlpcloud +``` + - Get an NLPCloud api key and set it as an environment variable (`NLPCLOUD_API_KEY`) -## Wrappers -### LLM +## LLM + +See a [usage example](/docs/integrations/llms/nlpcloud). -There exists an NLPCloud LLM wrapper, which you can access with ```python from langchain.llms import NLPCloud ``` + +## Text Embedding Models + +See a [usage example](/docs/integrations/text_embedding/nlp_cloud) + +```python +from langchain.embeddings import NLPCloudEmbeddings +``` diff --git a/docs/extras/integrations/providers/spacy.mdx b/docs/extras/integrations/providers/spacy.mdx index f4d49497dd..ab9b685898 100644 --- a/docs/extras/integrations/providers/spacy.mdx +++ b/docs/extras/integrations/providers/spacy.mdx @@ -18,3 +18,11 @@ See a [usage example](/docs/modules/data_connection/document_transformers/text_s ```python from langchain.text_splitter import SpacyTextSplitter ``` + +## Text Embedding Models + +See a [usage example](/docs/integrations/text_embedding/spacy_embedding) + +```python +from langchain.embeddings.spacy_embeddings import SpacyEmbeddings +``` diff --git a/docs/extras/integrations/text_embedding/Awa.ipynb b/docs/extras/integrations/text_embedding/awadb.ipynb similarity index 89% rename from docs/extras/integrations/text_embedding/Awa.ipynb rename to docs/extras/integrations/text_embedding/awadb.ipynb index 1fb7ddca6f..f2c1e73392 100644 --- a/docs/extras/integrations/text_embedding/Awa.ipynb +++ b/docs/extras/integrations/text_embedding/awadb.ipynb @@ -5,9 +5,11 @@ "id": "b14a24db", "metadata": {}, "source": [ - "# AwaEmbedding\n", + "# AwaDB\n", "\n", - "This notebook explains how to use AwaEmbedding, which is included in [awadb](https://github.com/awa-ai/awadb), to embedding texts in langchain." + ">[AwaDB](https://github.com/awa-ai/awadb) is an AI Native database for the search and storage of embedding vectors used by LLM Applications.\n", + "\n", + "This notebook explains how to use `AwaEmbeddings` in LangChain." 
   ]
  },
  {
@@ -101,7 +103,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.11.4"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/docs/extras/integrations/text_embedding/bedrock.ipynb b/docs/extras/integrations/text_embedding/bedrock.ipynb
index 7c16cb8ead..0dbbcd080f 100644
--- a/docs/extras/integrations/text_embedding/bedrock.ipynb
+++ b/docs/extras/integrations/text_embedding/bedrock.ipynb
@@ -5,7 +5,9 @@
    "id": "75e378f5-55d7-44b6-8e2e-6d7b8b171ec4",
    "metadata": {},
    "source": [
-    "# Bedrock Embeddings"
+    "# Bedrock\n",
+    "\n",
+    ">[Amazon Bedrock](https://aws.amazon.com/bedrock/) is a fully managed service that makes FMs from leading AI startups and Amazon available via an API, so you can choose from a wide range of FMs to find the model that is best suited for your use case.\n"
   ]
  },
  {
@@ -91,7 +93,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.13"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/docs/extras/integrations/text_embedding/bge_huggingface.ipynb b/docs/extras/integrations/text_embedding/bge_huggingface.ipynb
index bcf196fc20..923ba92874 100644
--- a/docs/extras/integrations/text_embedding/bge_huggingface.ipynb
+++ b/docs/extras/integrations/text_embedding/bge_huggingface.ipynb
@@ -5,26 +5,29 @@
    "id": "719619d3",
    "metadata": {},
    "source": [
-    "# BGE Hugging Face Embeddings\n",
+    "# BGE on Hugging Face\n",
     "\n",
-    "This notebook shows how to use BGE Embeddings through Hugging Face"
+    ">[BGE models on Hugging Face](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).\n",
+    ">The BGE model was created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://www.baai.ac.cn/english.html). 
`BAAI` is a private non-profit organization engaged in AI research and development.\n",
+    "\n",
+    "This notebook shows how to use `BGE Embeddings` through `Hugging Face`"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "id": "f7a54279",
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
-    "# !pip install sentence_transformers"
+    "#!pip install sentence_transformers"
   ]
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": null,
   "id": "9e1d5b6b",
   "metadata": {},
   "outputs": [],
@@ -43,12 +46,24 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 5,
   "id": "e59d1a89",
   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "384"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
   "source": [
-    "embedding = hf.embed_query(\"hi this is harrison\")"
+    "embedding = hf.embed_query(\"hi this is harrison\")\n",
+    "len(embedding)"
   ]
  },
  {
@@ -76,7 +91,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.1"
+   "version": "3.10.12"
   }
  },
  "nbformat": 4,
diff --git a/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb b/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb
index ea607467fb..4c0c515e80 100644
--- a/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb
+++ b/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb
@@ -1,13 +1,14 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Google Cloud Platform Vertex AI PaLM \n",
+    "# Google Vertex AI PaLM \n",
     "\n",
-    "Note: This is seperate from the Google PaLM integration, it exposes [Vertex AI PaLM API](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) on Google Cloud. \n",
+    ">[Vertex AI PaLM API](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) is a service on Google Cloud exposing the embedding models. \n",
+    "\n",
+    "Note: This integration is separate from the Google PaLM integration.\n",
     "\n",
     "By default, Google Cloud [does not use](https://cloud.google.com/vertex-ai/docs/generative-ai/data-governance#foundation_model_development) Customer Data to train its foundation models as part of Google Cloud`s AI/ML Privacy Commitment. More details about how Google processes data can also be found in [Google's Customer Data Processing Addendum (CDPA)](https://cloud.google.com/terms/data-processing-addendum).\n",
     "\n",
@@ -96,7 +97,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/extras/integrations/text_embedding/modelscope_hub.ipynb b/docs/extras/integrations/text_embedding/modelscope_hub.ipynb
index 765d46769c..e2f47c4f3a 100644
--- a/docs/extras/integrations/text_embedding/modelscope_hub.ipynb
+++ b/docs/extras/integrations/text_embedding/modelscope_hub.ipynb
@@ -1,12 +1,13 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
     "# ModelScope\n",
     "\n",
+    ">[ModelScope](https://www.modelscope.cn/home) is a big repository of the models and datasets.\n",
+    "\n",
     "Let's load the ModelScope Embedding class." 
] }, @@ -67,16 +68,23 @@ ], "metadata": { "kernelspec": { - "display_name": "chatgpt", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.9.15" - }, - "orig_nbformat": 4 + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/extras/integrations/text_embedding/mosaicml.ipynb b/docs/extras/integrations/text_embedding/mosaicml.ipynb index 2d91c8d9c5..24d7aecb72 100644 --- a/docs/extras/integrations/text_embedding/mosaicml.ipynb +++ b/docs/extras/integrations/text_embedding/mosaicml.ipynb @@ -1,15 +1,14 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# MosaicML embeddings\n", + "# MosaicML\n", "\n", - "[MosaicML](https://docs.mosaicml.com/en/latest/inference.html) offers a managed inference service. You can either use a variety of open source models, or deploy your own.\n", + ">[MosaicML](https://docs.mosaicml.com/en/latest/inference.html) offers a managed inference service. You can either use a variety of open source models, or deploy your own.\n", "\n", - "This example goes over how to use LangChain to interact with MosaicML Inference for text embedding." + "This example goes over how to use LangChain to interact with `MosaicML` Inference for text embedding." ] }, { @@ -94,6 +93,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -103,9 +107,10 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/extras/integrations/text_embedding/nlp_cloud.ipynb b/docs/extras/integrations/text_embedding/nlp_cloud.ipynb index 73ae71fe0f..9567d59c4b 100644 --- a/docs/extras/integrations/text_embedding/nlp_cloud.ipynb +++ b/docs/extras/integrations/text_embedding/nlp_cloud.ipynb @@ -7,7 +7,7 @@ "source": [ "# NLP Cloud\n", "\n", - "NLP Cloud is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. \n", + ">[NLP Cloud](https://docs.nlpcloud.com/#introduction) is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. 
\n", "\n", "The [embeddings](https://docs.nlpcloud.com/#embeddings) endpoint offers the following model:\n", "\n", @@ -80,7 +80,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.11.2 64-bit", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -94,7 +94,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb b/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb index fe5299ae6f..ec80112e10 100644 --- a/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb +++ b/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb @@ -5,11 +5,13 @@ "id": "1f83f273", "metadata": {}, "source": [ - "# SageMaker Endpoint Embeddings\n", + "# SageMaker\n", "\n", - "Let's load the SageMaker Endpoints Embeddings class. The class can be used if you host, e.g. your own Hugging Face model on SageMaker.\n", + "Let's load the `SageMaker Endpoints Embeddings` class. The class can be used if you host, e.g. your own Hugging Face model on SageMaker.\n", "\n", - "For instructions on how to do this, please see [here](https://www.philschmid.de/custom-inference-huggingface-sagemaker). **Note**: In order to handle batched requests, you will need to adjust the return line in the `predict_fn()` function within the custom `inference.py` script:\n", + "For instructions on how to do this, please see [here](https://www.philschmid.de/custom-inference-huggingface-sagemaker). \n", + "\n", + "**Note**: In order to handle batched requests, you will need to adjust the return line in the `predict_fn()` function within the custom `inference.py` script:\n", "\n", "Change from\n", "\n", @@ -143,7 +145,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/extras/integrations/text_embedding/self-hosted.ipynb b/docs/extras/integrations/text_embedding/self-hosted.ipynb index 00c497220e..47faa6bf2d 100644 --- a/docs/extras/integrations/text_embedding/self-hosted.ipynb +++ b/docs/extras/integrations/text_embedding/self-hosted.ipynb @@ -5,8 +5,8 @@ "id": "eec4efda", "metadata": {}, "source": [ - "# Self Hosted Embeddings\n", - "Let's load the SelfHostedEmbeddings, SelfHostedHuggingFaceEmbeddings, and SelfHostedHuggingFaceInstructEmbeddings classes." + "# Self Hosted\n", + "Let's load the `SelfHostedEmbeddings`, `SelfHostedHuggingFaceEmbeddings`, and `SelfHostedHuggingFaceInstructEmbeddings` classes." 
   ]
  },
  {
@@ -149,9 +149,7 @@
   "cell_type": "code",
   "execution_count": null,
   "id": "fc1bfd0f",
-   "metadata": {
-    "scrolled": false
-   },
+   "metadata": {},
   "outputs": [],
   "source": [
    "query_result = embeddings.embed_query(text)"
@@ -182,7 +180,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.1"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/extras/integrations/text_embedding/sentence_transformers.ipynb b/docs/extras/integrations/text_embedding/sentence_transformers.ipynb
index 67eb83ab7c..e4649e6b71 100644
--- a/docs/extras/integrations/text_embedding/sentence_transformers.ipynb
+++ b/docs/extras/integrations/text_embedding/sentence_transformers.ipynb
@@ -1,16 +1,15 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "id": "ed47bb62",
    "metadata": {},
    "source": [
-    "# Sentence Transformers Embeddings\n",
+    "# Sentence Transformers\n",
     "\n",
-    "[SentenceTransformers](https://www.sbert.net/) embeddings are called using the `HuggingFaceEmbeddings` integration. We have also added an alias for `SentenceTransformerEmbeddings` for users who are more familiar with directly using that package.\n",
+    ">[SentenceTransformers](https://www.sbert.net/) embeddings are called using the `HuggingFaceEmbeddings` integration. We have also added an alias for `SentenceTransformerEmbeddings` for users who are more familiar with directly using that package.\n",
     "\n",
-    "SentenceTransformers is a python package that can generate text and image embeddings, originating from [Sentence-BERT](https://arxiv.org/abs/1908.10084)"
+    "`SentenceTransformers` is a python package that can generate text and image embeddings, originating from [Sentence-BERT](https://arxiv.org/abs/1908.10084)"
   ]
  },
  {
@@ -109,7 +108,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.16"
+   "version": "3.10.12"
   },
   "vscode": {
    "interpreter": {
diff --git a/docs/extras/integrations/text_embedding/spacy_embedding.ipynb b/docs/extras/integrations/text_embedding/spacy_embedding.ipynb
index bfea82d5d4..edda4828b4 100644
--- a/docs/extras/integrations/text_embedding/spacy_embedding.ipynb
+++ b/docs/extras/integrations/text_embedding/spacy_embedding.ipynb
@@ -1,21 +1,31 @@
 {
  "cells": [
   {
-   "attachments": {},
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Spacy Embedding\n",
+    "# SpaCy\n",
     "\n",
-    "### Loading the Spacy embedding class to generate and query embeddings"
+    ">[spaCy](https://spacy.io/) is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython.\n",
+    " \n",
+    "\n",
+    "## Installation and Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#!pip install spacy"
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Import the necessary classes"
+    "Import the necessary classes"
   ]
  },
  {
@@ -28,11 +38,12 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Initialize SpacyEmbeddings.This will load the Spacy model into memory."
+    "## Example\n",
+    "\n",
+    "Initialize SpacyEmbeddings. This will load the Spacy model into memory."
   ]
  },
  {
@@ -45,11 +56,10 @@
   "source": [
    "embeddings = SpacyEmbeddings()"
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Define some example texts . These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews."
+    "Define some example texts. These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews."
   ]
  },
  {
@@ -67,11 +77,10 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Generate and print embeddings for the texts . The SpacyEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification."
+    "Generate and print embeddings for the texts. The SpacyEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification."
   ]
  },
  {
@@ -86,11 +95,10 @@
   ]
  },
  {
-   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "#### Generate and print an embedding for a single piece of text. You can also generate an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query."
+    "Generate and print an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query."
   ]
  },
  {
@@ -106,11 +114,24 @@
   }
  ],
  "metadata": {
-  "language_info": {
-   "name": "python"
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
   },
-  "orig_nbformat": 4
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
 },
 "nbformat": 4,
-  "nbformat_minor": 2
+  "nbformat_minor": 4
 }

From 28de8d132c8c4f7ecfe246c61375d91a04ff0abf Mon Sep 17 00:00:00 2001
From: stopdropandrew
Date: Thu, 7 Sep 2023 19:54:53 -0700
Subject: [PATCH 12/13] Change StructuredTool's ainvoke to await (#10300)

Fixes #10080. StructuredTool's `ainvoke` doesn't `await`.
---
 libs/langchain/langchain/tools/base.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libs/langchain/langchain/tools/base.py b/libs/langchain/langchain/tools/base.py
index 9ad81033d5..69597cd903 100644
--- a/libs/langchain/langchain/tools/base.py
+++ b/libs/langchain/langchain/tools/base.py
@@ -592,7 +592,7 @@ class StructuredTool(BaseTool):
                 None, partial(self.invoke, input, config, **kwargs)
             )

-        return super().ainvoke(input, config, **kwargs)
+        return await super().ainvoke(input, config, **kwargs)

     # --- Tool ---

From 01e9d7902d4bdd0de88bc8d63c96d8d8894b8c4c Mon Sep 17 00:00:00 2001
From: C Mazzoni
Date: Fri, 8 Sep 2023 01:04:55 -0400
Subject: [PATCH 13/13] Update tool.py (#10203)

Fixed the description of the QuerySQLCheckerTool: the last line of the
string description still used the tool's old name, 'query_sql_db', instead
of its current name, 'sql_db_query'. This caused the models to sometimes
call a non-existent tool. The issue was not numerically identified. 
No dependencies --- libs/langchain/langchain/tools/sql_database/tool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/langchain/langchain/tools/sql_database/tool.py b/libs/langchain/langchain/tools/sql_database/tool.py index f60275bcaf..75f45c7b9e 100644 --- a/libs/langchain/langchain/tools/sql_database/tool.py +++ b/libs/langchain/langchain/tools/sql_database/tool.py @@ -93,7 +93,7 @@ class QuerySQLCheckerTool(BaseSQLDatabaseTool, BaseTool): name: str = "sql_db_query_checker" description: str = """ Use this tool to double check if your query is correct before executing it. - Always use this tool before executing a query with query_sql_db! + Always use this tool before executing a query with sql_db_query! """ @root_validator(pre=True)
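
The one-word change in PATCH 12 above is easy to misread, so here is a minimal, self-contained sketch of the bug class it fixes. The `Base`, `Broken`, and `Fixed` names below are hypothetical stand-ins, not LangChain's actual `BaseTool` or `StructuredTool`: inside an `async def`, `return super().ainvoke(...)` hands the caller an un-awaited coroutine instead of the tool's output, so a single `await` on the subclass's `ainvoke` never produces the result.

```python
# Hypothetical minimal reproduction of the PATCH 12 bug -- not LangChain code.
import asyncio


class Base:
    async def ainvoke(self, x: int) -> int:
        return x * 2


class Broken(Base):
    async def ainvoke(self, x: int) -> int:
        # Bug: the parent coroutine is returned without being awaited, so
        # `await Broken().ainvoke(...)` yields a coroutine, not an int.
        return super().ainvoke(x)  # type: ignore[return-value]


class Fixed(Base):
    async def ainvoke(self, x: int) -> int:
        # The patch's fix: await the parent coroutine before returning.
        return await super().ainvoke(x)


async def main() -> None:
    broken = await Broken().ainvoke(21)
    print(type(broken).__name__)      # "coroutine" -- result never computed
    print(await broken)               # 42, but only after a second await
    print(await Fixed().ainvoke(21))  # 42, as expected


asyncio.run(main())
```

Nothing raises in the broken version; it simply hands back the wrong object, which is likely why the missing `await` was easy to miss until #10080 was reported.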