diff --git a/.github/actions/poetry_setup/action.yml b/.github/actions/poetry_setup/action.yml index 9d7ce548de..d1342465c3 100644 --- a/.github/actions/poetry_setup/action.yml +++ b/.github/actions/poetry_setup/action.yml @@ -31,20 +31,45 @@ runs: with: python-version: ${{ inputs.python-version }} - # - uses: actions/cache@v3 - # id: cache-bin-poetry - # name: Cache Poetry binary - Python ${{ inputs.python-version }} - # env: - # SEGMENT_DOWNLOAD_TIMEOUT_MIN: "1" - # with: - # path: | - # /opt/pipx/venvs/poetry - # /opt/pipx_bin/poetry - # # This step caches the poetry installation, so make sure it's keyed on the poetry version as well. - # key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }} + - uses: actions/cache@v3 + id: cache-bin-poetry + name: Cache Poetry binary - Python ${{ inputs.python-version }} + env: + SEGMENT_DOWNLOAD_TIMEOUT_MIN: "1" + with: + path: | + /opt/pipx/venvs/poetry + # This step caches the poetry installation, so make sure it's keyed on the poetry version as well. + key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }} + + - name: Refresh shell hashtable and fixup softlinks + if: steps.cache-bin-poetry.outputs.cache-hit == 'true' + shell: bash + env: + POETRY_VERSION: ${{ inputs.poetry-version }} + PYTHON_VERSION: ${{ inputs.python-version }} + run: | + set -eux + + # Refresh the shell hashtable, to ensure correct `which` output. + hash -r + + # `actions/cache@v3` doesn't always seem able to correctly unpack softlinks. + # Delete and recreate the softlinks pipx expects to have. + rm /opt/pipx/venvs/poetry/bin/python + cd /opt/pipx/venvs/poetry/bin + ln -s "$(which "python$PYTHON_VERSION")" python + chmod +x python + cd /opt/pipx_bin/ + ln -s /opt/pipx/venvs/poetry/bin/poetry poetry + chmod +x poetry + + # Ensure everything got set up correctly. + /opt/pipx/venvs/poetry/bin/python --version + /opt/pipx_bin/poetry --version - name: Install poetry - # if: steps.cache-bin-poetry.outputs.cache-hit != 'true' + if: steps.cache-bin-poetry.outputs.cache-hit != 'true' shell: bash env: POETRY_VERSION: ${{ inputs.poetry-version }} diff --git a/.github/workflows/_lint.yml b/.github/workflows/_lint.yml index 1a01b225a0..64169ce0be 100644 --- a/.github/workflows/_lint.yml +++ b/.github/workflows/_lint.yml @@ -87,7 +87,7 @@ jobs: python-version: ${{ matrix.python-version }} poetry-version: ${{ env.POETRY_VERSION }} working-directory: ${{ inputs.working-directory }} - cache-key: lint + cache-key: lint-with-extras - name: Check Poetry File shell: bash @@ -102,9 +102,17 @@ poetry lock --check - name: Install dependencies + # Also installs dev/lint/test/typing dependencies, to ensure we have + # type hints for as many of our libraries as possible. + # This helps catch errors that can only be spotted once dependencies are installed, for example: + # https://github.com/langchain-ai/langchain/pull/10249/files#diff-935185cd488d015f026dcd9e19616ff62863e8cde8c0bee70318d3ccbca98341 + # + # If you change this configuration, make sure to change the `cache-key` + # in the `poetry_setup` action above to stop using the old cache. + # It doesn't matter how you change it, any change will cause a cache-bust. 
working-directory: ${{ inputs.working-directory }} run: | - poetry install + poetry install --with dev,lint,test,typing - name: Install langchain editable working-directory: ${{ inputs.working-directory }} diff --git a/.github/workflows/_pydantic_compatibility.yml b/.github/workflows/_pydantic_compatibility.yml index 7d8fe26d92..94d362f327 100644 --- a/.github/workflows/_pydantic_compatibility.yml +++ b/.github/workflows/_pydantic_compatibility.yml @@ -79,3 +79,15 @@ jobs: - name: Run pydantic compatibility tests shell: bash run: make test + + - name: Ensure the tests did not create any additional files + shell: bash + run: | + set -eu + + STATUS="$(git status)" + echo "$STATUS" + + # grep will exit non-zero if the target message isn't found, + # and `set -e` above will cause the step to fail. + echo "$STATUS" | grep 'nothing to commit, working tree clean' diff --git a/.github/workflows/_test.yml b/.github/workflows/_test.yml index 76d86a2862..04be6a2c39 100644 --- a/.github/workflows/_test.yml +++ b/.github/workflows/_test.yml @@ -43,3 +43,15 @@ jobs: - name: Run core tests shell: bash run: make test + + - name: Ensure the tests did not create any additional files + shell: bash + run: | + set -eu + + STATUS="$(git status)" + echo "$STATUS" + + # grep will exit non-zero if the target message isn't found, + # and `set -e` above will cause the step to fail. + echo "$STATUS" | grep 'nothing to commit, working tree clean' diff --git a/.github/workflows/langchain_ci.yml b/.github/workflows/langchain_ci.yml index 8f1fc5d874..f184af9772 100644 --- a/.github/workflows/langchain_ci.yml +++ b/.github/workflows/langchain_ci.yml @@ -6,6 +6,8 @@ on: branches: [ master ] pull_request: paths: + - '.github/actions/poetry_setup/action.yml' + - '.github/tools/**' - '.github/workflows/_lint.yml' - '.github/workflows/_test.yml' - '.github/workflows/_pydantic_compatibility.yml' @@ -81,3 +83,15 @@ jobs: - name: Run extended tests run: make extended_tests + + - name: Ensure the tests did not create any additional files + shell: bash + run: | + set -eu + + STATUS="$(git status)" + echo "$STATUS" + + # grep will exit non-zero if the target message isn't found, + # and `set -e` above will cause the step to fail. + echo "$STATUS" | grep 'nothing to commit, working tree clean' diff --git a/.github/workflows/langchain_experimental_ci.yml b/.github/workflows/langchain_experimental_ci.yml index 5b00365f82..c4c4a039c1 100644 --- a/.github/workflows/langchain_experimental_ci.yml +++ b/.github/workflows/langchain_experimental_ci.yml @@ -6,6 +6,8 @@ on: branches: [ master ] pull_request: paths: + - '.github/actions/poetry_setup/action.yml' + - '.github/tools/**' - '.github/workflows/_lint.yml' - '.github/workflows/_test.yml' - '.github/workflows/langchain_experimental_ci.yml' @@ -113,3 +115,15 @@ jobs: - name: Run extended tests run: make extended_tests + + - name: Ensure the tests did not create any additional files + shell: bash + run: | + set -eu + + STATUS="$(git status)" + echo "$STATUS" + + # grep will exit non-zero if the target message isn't found, + # and `set -e` above will cause the step to fail. 
+ echo "$STATUS" | grep 'nothing to commit, working tree clean' diff --git a/.github/workflows/scheduled_test.yml b/.github/workflows/scheduled_test.yml index b71eee0592..7ce59d5b69 100644 --- a/.github/workflows/scheduled_test.yml +++ b/.github/workflows/scheduled_test.yml @@ -47,3 +47,15 @@ jobs: OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | make scheduled_tests + + - name: Ensure the tests did not create any additional files + shell: bash + run: | + set -eu + + STATUS="$(git status)" + echo "$STATUS" + + # grep will exit non-zero if the target message isn't found, + # and `set -e` above will cause the step to fail. + echo "$STATUS" | grep 'nothing to commit, working tree clean' diff --git a/docs/api_reference/guide_imports.json b/docs/api_reference/guide_imports.json index f35c7805a0..8e4d0fed32 100644 --- a/docs/api_reference/guide_imports.json +++ b/docs/api_reference/guide_imports.json @@ -317,7 +317,7 @@ "Chatbots": "https://python.langchain.com/docs/use_cases/chatbots", "Summarization": "https://python.langchain.com/docs/use_cases/summarization", "Extraction": "https://python.langchain.com/docs/use_cases/extraction", - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "Tagging": "https://python.langchain.com/docs/use_cases/tagging", "Code Understanding": "https://python.langchain.com/docs/use_cases/code_understanding", "AutoGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/autogpt", @@ -400,7 +400,7 @@ "Summarization": "https://python.langchain.com/docs/use_cases/summarization", "Extraction": "https://python.langchain.com/docs/use_cases/extraction", "Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis", - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "QA over Documents": "https://python.langchain.com/docs/use_cases/question_answering/index", "Retrieve from vector stores directly": "https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_text_generation", "Improve document indexing with HyDE": "https://python.langchain.com/docs/use_cases/question_answering/how_to/hyde", @@ -641,7 +641,7 @@ "Chatbots": "https://python.langchain.com/docs/use_cases/chatbots", "Extraction": "https://python.langchain.com/docs/use_cases/extraction", "Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis", - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "HuggingGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/hugginggpt", "Perform context-aware text splitting": "https://python.langchain.com/docs/use_cases/question_answering/how_to/document-context-aware-QA", "Retrieve from vector stores directly": "https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_text_generation", @@ -1009,7 +1009,7 @@ "LangSmith Walkthrough": "https://python.langchain.com/docs/guides/langsmith/walkthrough", "Comparing Chain Outputs": "https://python.langchain.com/docs/guides/evaluation/examples/comparisons", "Agent Trajectory": "https://python.langchain.com/docs/guides/evaluation/trajectory/trajectory_eval", - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "Multi-modal outputs: Image & Text": "https://python.langchain.com/docs/use_cases/multi_modal/image_agent", 
"Agent Debates with Tools": "https://python.langchain.com/docs/use_cases/agent_simulations/two_agent_debate_tools", "Multiple callback handlers": "https://python.langchain.com/docs/modules/callbacks/multiple_callbacks", @@ -1268,7 +1268,7 @@ "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database", "JSON Agent": "https://python.langchain.com/docs/integrations/toolkits/json", "NIBittensorLLM": "https://python.langchain.com/docs/integrations/llms/bittensor", - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "BabyAGI with Tools": "https://python.langchain.com/docs/use_cases/agents/baby_agi_with_agent", "Conversational Retrieval Agent": "https://python.langchain.com/docs/use_cases/question_answering/how_to/conversational_retrieval_agents", "Plug-and-Plai": "https://python.langchain.com/docs/use_cases/agents/custom_agent_with_plugin_retrieval_using_plugnplai", @@ -1832,12 +1832,12 @@ "create_sql_agent": { "CnosDB": "https://python.langchain.com/docs/integrations/providers/cnosdb", "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database", - "SQL": "https://python.langchain.com/docs/use_cases/sql" + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql" }, "SQLDatabaseToolkit": { "CnosDB": "https://python.langchain.com/docs/integrations/providers/cnosdb", "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database", - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "Use ToolKits with OpenAI Functions": "https://python.langchain.com/docs/modules/agents/how_to/use_toolkits_with_openai_functions" }, "SageMakerCallbackHandler": { @@ -1899,7 +1899,7 @@ "Rebuff": "https://python.langchain.com/docs/integrations/providers/rebuff", "SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database", "Cookbook": "https://python.langchain.com/docs/guides/expression_language/cookbook", - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "Multiple Retrieval Sources": "https://python.langchain.com/docs/use_cases/question_answering/how_to/multiple_retrieval" }, "Weaviate": { @@ -3035,11 +3035,11 @@ "Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis" }, "create_sql_query_chain": { - "SQL": "https://python.langchain.com/docs/use_cases/sql", + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql", "Multiple Retrieval Sources": "https://python.langchain.com/docs/use_cases/question_answering/how_to/multiple_retrieval" }, "ElasticsearchDatabaseChain": { - "SQL": "https://python.langchain.com/docs/use_cases/sql" + "SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql" }, "FileChatMessageHistory": { "AutoGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/autogpt" diff --git a/docs/docs_skeleton/docs/modules/model_io/output_parsers/index.mdx b/docs/docs_skeleton/docs/modules/model_io/output_parsers/index.mdx index bfb4d7241a..a46031ffdd 100644 --- a/docs/docs_skeleton/docs/modules/model_io/output_parsers/index.mdx +++ b/docs/docs_skeleton/docs/modules/model_io/output_parsers/index.mdx @@ -12,7 +12,7 @@ Output parsers are classes that help structure language model responses. 
There a And then one optional one: -- "Parse with prompt": A method which takes in a string (assumed to be the response from a language model) and a prompt (assumed to the prompt that generated such a response) and parses it into some structure. The prompt is largely provided in the event the OutputParser wants to retry or fix the output in some way, and needs information from the prompt to do so. +- "Parse with prompt": A method which takes in a string (assumed to be the response from a language model) and a prompt (assumed to be the prompt that generated such a response) and parses it into some structure. The prompt is largely provided in the event the OutputParser wants to retry or fix the output in some way, and needs information from the prompt to do so. ## Get started diff --git a/docs/docs_skeleton/docs/use_cases/question_answering/_category_.yml b/docs/docs_skeleton/docs/use_cases/question_answering/_category_.yml new file mode 100644 index 0000000000..75252fdc39 --- /dev/null +++ b/docs/docs_skeleton/docs/use_cases/question_answering/_category_.yml @@ -0,0 +1,2 @@ +position: 0 +collapsed: false diff --git a/docs/docs_skeleton/docs/use_cases/web_scraping/index.mdx b/docs/docs_skeleton/docs/use_cases/web_scraping/index.mdx deleted file mode 100644 index ce28ca3839..0000000000 --- a/docs/docs_skeleton/docs/use_cases/web_scraping/index.mdx +++ /dev/null @@ -1,9 +0,0 @@ ---- -sidebar_position: 3 ---- - -# Web Scraping - -Web scraping has historically been a challenging endeavor due to the ever-changing nature of website structures, making it tedious for developers to maintain their scraping scripts. Traditional methods often rely on specific HTML tags and patterns which, when altered, can disrupt data extraction processes. - -Enter the LLM-based method for parsing HTML: By leveraging the capabilities of LLMs, and especially OpenAI Functions in LangChain's extraction chain, developers can instruct the model to extract only the desired data in a specified format. This method not only streamlines the extraction process but also significantly reduces the time spent on manual debugging and script modifications. Its adaptability means that even if websites undergo significant design changes, the extraction remains consistent and robust. This level of resilience translates to reduced maintenance efforts, cost savings, and ensures a higher quality of extracted data. Compared to its predecessors, the LLM-based approach wins out in the web scraping domain by transforming a historically cumbersome task into a more automated and efficient process. 
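The deleted page above describes the extraction-chain approach only in prose. For reference, here is a minimal sketch of the pattern it refers to, using LangChain's `create_extraction_chain` helper; the schema fields and HTML snippet are hypothetical examples, and an OpenAI API key plus a function-calling-capable model are assumed.

```python
from langchain.chains import create_extraction_chain
from langchain.chat_models import ChatOpenAI

# Hypothetical schema describing the fields to pull out of raw page content.
schema = {
    "properties": {
        "article_title": {"type": "string"},
        "article_author": {"type": "string"},
    },
    "required": ["article_title"],
}

# The chain turns the schema into an OpenAI function definition, asks the
# model to call it, and parses the structured arguments back out.
llm = ChatOpenAI(temperature=0)
chain = create_extraction_chain(schema, llm)

# Stand-in for HTML fetched by a scraper; a real pipeline would load and
# clean the page first.
html = "<h1>Hello LangChain</h1><p>By Jane Doe</p>"
print(chain.run(html))
# e.g. [{'article_title': 'Hello LangChain', 'article_author': 'Jane Doe'}]
```

Because the model is steered by the schema rather than by CSS selectors, the same chain keeps working when a page's markup changes, which is the resilience argument the deleted page makes.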
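Similarly, the output-parsers hunk earlier in this section describes the optional "parse with prompt" method in prose only. Below is a minimal sketch of a custom parser implementing it, assuming the `langchain.schema` base classes of this era; the class and its simple fallback behavior are illustrative, not taken from the docs page.

```python
from langchain.schema import BaseOutputParser, PromptValue


class CommaSeparatedListParser(BaseOutputParser):
    """Parses a model response into a list of strings."""

    def parse(self, text: str) -> list:
        return [item.strip() for item in text.split(",")]

    def parse_with_prompt(self, completion: str, prompt: PromptValue) -> list:
        # Having the prompt available lets a parser retry or repair bad
        # output (e.g. by re-asking the model); this sketch just falls
        # back to plain parse().
        return self.parse(completion)


print(CommaSeparatedListParser().parse("red, green, blue"))
# -> ['red', 'green', 'blue']
```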
diff --git a/docs/docs_skeleton/vercel.json b/docs/docs_skeleton/vercel.json index dfa378952a..0c7625af99 100644 --- a/docs/docs_skeleton/vercel.json +++ b/docs/docs_skeleton/vercel.json @@ -1076,6 +1076,10 @@ "source": "/docs/modules/agents/tools/integrations/zapier", "destination": "/docs/integrations/tools/zapier" }, + { + "source": "/docs/integrations/tools/sqlite", + "destination": "/docs/use_cases/sql/sqlite" + }, { "source": "/en/latest/modules/callbacks/filecallbackhandler.html", "destination": "/docs/modules/callbacks/how_to/filecallbackhandler" @@ -2216,6 +2220,10 @@ "source": "/docs/modules/data_connection/text_embedding/integrations/tensorflowhub", "destination": "/docs/integrations/text_embedding/tensorflowhub" }, + { + "source": "/docs/integrations/text_embedding/Awa", + "destination": "/docs/integrations/text_embedding/awadb" + }, { "source": "/en/latest/modules/indexes/vectorstores/examples/analyticdb.html", "destination": "/docs/integrations/vectorstores/analyticdb" @@ -3178,7 +3186,11 @@ }, { "source": "/en/latest/use_cases/tabular.html", - "destination": "/docs/use_cases/tabular" + "destination": "/docs/use_cases/qa_structured" + }, + { + "source": "/docs/use_cases/sql(/?)", + "destination": "/docs/use_cases/qa_structured/sql" }, { "source": "/en/latest/youtube.html", @@ -3370,7 +3382,7 @@ }, { "source": "/docs/modules/chains/popular/sqlite", - "destination": "/docs/use_cases/tabular/sqlite" + "destination": "/docs/use_cases/qa_structured/sql" }, { "source": "/docs/modules/chains/popular/openai_functions", @@ -3582,7 +3594,7 @@ }, { "source": "/docs/modules/chains/additional/elasticsearch_database", - "destination": "/docs/use_cases/tabular/elasticsearch_database" + "destination": "/docs/use_cases/qa_structured/integrations/elasticsearch" }, { "source": "/docs/modules/chains/additional/tagging", diff --git a/docs/extras/additional_resources/youtube.mdx b/docs/extras/additional_resources/youtube.mdx index fc266bf48b..78da30e453 100644 --- a/docs/extras/additional_resources/youtube.mdx +++ b/docs/extras/additional_resources/youtube.mdx @@ -1,6 +1,6 @@ # YouTube videos -⛓ icon marks a new addition [last update 2023-06-20] +⛓ icon marks a new addition [last update 2023-09-05] ### [Official LangChain YouTube channel](https://www.youtube.com/@LangChain) @@ -86,20 +86,20 @@ - [`Llama Index`: Chat with Documentation using URL Loader](https://youtu.be/XJRoDEctAwA) by [Merk](https://www.youtube.com/@merksworld) - [Using OpenAI, LangChain, and `Gradio` to Build Custom GenAI Applications](https://youtu.be/1MsmqMg3yUc) by [David Hundley](https://www.youtube.com/@dkhundley) - [LangChain, Chroma DB, OpenAI Beginner Guide | ChatGPT with your PDF](https://youtu.be/FuqdVNB_8c0) -- ⛓ [Build AI chatbot with custom knowledge base using OpenAI API and GPT Index](https://youtu.be/vDZAZuaXf48) by [Irina Nik](https://www.youtube.com/@irina_nik) -- ⛓ [Build Your Own Auto-GPT Apps with LangChain (Python Tutorial)](https://youtu.be/NYSWn1ipbgg) by [Dave Ebbelaar](https://www.youtube.com/@daveebbelaar) -- ⛓ [Chat with Multiple `PDFs` | LangChain App Tutorial in Python (Free LLMs and Embeddings)](https://youtu.be/dXxQ0LR-3Hg) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao) -- ⛓ [Chat with a `CSV` | `LangChain Agents` Tutorial (Beginners)](https://youtu.be/tjeti5vXWOU) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao) -- ⛓ [Create Your Own ChatGPT with `PDF` Data in 5 Minutes (LangChain Tutorial)](https://youtu.be/au2WVVGUvc8) by [Liam 
Ottley](https://www.youtube.com/@LiamOttley) -- ⛓ [Using ChatGPT with YOUR OWN Data. This is magical. (LangChain OpenAI API)](https://youtu.be/9AXP7tCI9PI) by [TechLead](https://www.youtube.com/@TechLead) -- ⛓ [Build a Custom Chatbot with OpenAI: `GPT-Index` & LangChain | Step-by-Step Tutorial](https://youtu.be/FIDv6nc4CgU) by [Fabrikod](https://www.youtube.com/@fabrikod) -- ⛓ [`Flowise` is an open source no-code UI visual tool to build 🦜🔗LangChain applications](https://youtu.be/CovAPtQPU0k) by [Cobus Greyling](https://www.youtube.com/@CobusGreylingZA) -- ⛓ [LangChain & GPT 4 For Data Analysis: The `Pandas` Dataframe Agent](https://youtu.be/rFQ5Kmkd4jc) by [Rabbitmetrics](https://www.youtube.com/@rabbitmetrics) -- ⛓ [`GirlfriendGPT` - AI girlfriend with LangChain](https://youtu.be/LiN3D1QZGQw) by [Toolfinder AI](https://www.youtube.com/@toolfinderai) -- ⛓ [`PrivateGPT`: Chat to your FILES OFFLINE and FREE [Installation and Tutorial]](https://youtu.be/G7iLllmx4qc) by [Prompt Engineering](https://www.youtube.com/@engineerprompt) -- ⛓ [How to build with Langchain 10x easier | ⛓️ LangFlow & `Flowise`](https://youtu.be/Ya1oGL7ZTvU) by [AI Jason](https://www.youtube.com/@AIJasonZ) -- ⛓ [Getting Started With LangChain In 20 Minutes- Build Celebrity Search Application](https://youtu.be/_FpT1cwcSLg) by [Krish Naik](https://www.youtube.com/@krishnaik06) - +- [Build AI chatbot with custom knowledge base using OpenAI API and GPT Index](https://youtu.be/vDZAZuaXf48) by [Irina Nik](https://www.youtube.com/@irina_nik) +- [Build Your Own Auto-GPT Apps with LangChain (Python Tutorial)](https://youtu.be/NYSWn1ipbgg) by [Dave Ebbelaar](https://www.youtube.com/@daveebbelaar) +- [Chat with Multiple `PDFs` | LangChain App Tutorial in Python (Free LLMs and Embeddings)](https://youtu.be/dXxQ0LR-3Hg) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao) +- [Chat with a `CSV` | `LangChain Agents` Tutorial (Beginners)](https://youtu.be/tjeti5vXWOU) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao) +- [Create Your Own ChatGPT with `PDF` Data in 5 Minutes (LangChain Tutorial)](https://youtu.be/au2WVVGUvc8) by [Liam Ottley](https://www.youtube.com/@LiamOttley) +- [Using ChatGPT with YOUR OWN Data. This is magical. 
(LangChain OpenAI API)](https://youtu.be/9AXP7tCI9PI) by [TechLead](https://www.youtube.com/@TechLead) +- [Build a Custom Chatbot with OpenAI: `GPT-Index` & LangChain | Step-by-Step Tutorial](https://youtu.be/FIDv6nc4CgU) by [Fabrikod](https://www.youtube.com/@fabrikod) +- [`Flowise` is an open source no-code UI visual tool to build 🦜🔗LangChain applications](https://youtu.be/CovAPtQPU0k) by [Cobus Greyling](https://www.youtube.com/@CobusGreylingZA) +- [LangChain & GPT 4 For Data Analysis: The `Pandas` Dataframe Agent](https://youtu.be/rFQ5Kmkd4jc) by [Rabbitmetrics](https://www.youtube.com/@rabbitmetrics) +- [`GirlfriendGPT` - AI girlfriend with LangChain](https://youtu.be/LiN3D1QZGQw) by [Toolfinder AI](https://www.youtube.com/@toolfinderai) +- [`PrivateGPT`: Chat to your FILES OFFLINE and FREE [Installation and Tutorial]](https://youtu.be/G7iLllmx4qc) by [Prompt Engineering](https://www.youtube.com/@engineerprompt) +- [How to build with Langchain 10x easier | ⛓️ LangFlow & `Flowise`](https://youtu.be/Ya1oGL7ZTvU) by [AI Jason](https://www.youtube.com/@AIJasonZ) +- [Getting Started With LangChain In 20 Minutes- Build Celebrity Search Application](https://youtu.be/_FpT1cwcSLg) by [Krish Naik](https://www.youtube.com/@krishnaik06) +- ⛓ [LangChain HowTo and Guides YouTube playlist](https://www.youtube.com/playlist?list=PL8motc6AQftk1Bs42EW45kwYbyJ4jOdiZ) by [Sam Witteveen](https://www.youtube.com/@samwitteveenai/) ### [Prompt Engineering and LangChain](https://www.youtube.com/watch?v=muXbPpG_ys4&list=PLEJK-H61Xlwzm5FYLDdKt_6yibO33zoMW) by [Venelin Valkov](https://www.youtube.com/@venelin_valkov) diff --git a/docs/extras/expression_language/cookbook.ipynb b/docs/extras/expression_language/cookbook.ipynb deleted file mode 100644 index 04b74164dd..0000000000 --- a/docs/extras/expression_language/cookbook.ipynb +++ /dev/null @@ -1,1664 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "9a9acd2e", - "metadata": {}, - "source": [ - "# Cookbook\n", - "\n", - "In this notebook we'll take a look at a few common types of sequences to create." - ] - }, - { - "cell_type": "markdown", - "id": "93aa2c87", - "metadata": {}, - "source": [ - "## PromptTemplate + LLM\n", - "\n", - "A PromptTemplate -> LLM is a core chain that is used in most other larger chains/systems." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "466b65b3", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.prompts import ChatPromptTemplate\n", - "from langchain.chat_models import ChatOpenAI" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "3c634ef0", - "metadata": {}, - "outputs": [], - "source": [ - "model = ChatOpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "d1850a1f", - "metadata": {}, - "outputs": [], - "source": [ - "prompt = ChatPromptTemplate.from_template(\"tell me a joke about {foo}\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "56d0669f", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "e3d0a6cd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='Why don\\'t bears use cell phones? 
\\n\\nBecause they always get terrible \"grizzly\" reception!', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"})" - ] - }, - { - "cell_type": "markdown", - "id": "7eb9ef50", - "metadata": {}, - "source": [ - "Often times we want to attach kwargs to the model that's passed in. Here's a few examples of that:" - ] - }, - { - "cell_type": "markdown", - "id": "0b1d8f88", - "metadata": {}, - "source": [ - "### Attaching Stop Sequences" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "562a06bf", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model.bind(stop=[\"\\n\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "43f5d04c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content=\"Why don't bears use cell phones?\", additional_kwargs={}, example=False)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"})" - ] - }, - { - "cell_type": "markdown", - "id": "f3eaf88a", - "metadata": {}, - "source": [ - "### Attaching Function Call information" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "f94b71b2", - "metadata": {}, - "outputs": [], - "source": [ - "functions = [\n", - " {\n", - " \"name\": \"joke\",\n", - " \"description\": \"A joke\",\n", - " \"parameters\": {\n", - " \"type\": \"object\",\n", - " \"properties\": {\n", - " \"setup\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"The setup for the joke\"\n", - " },\n", - " \"punchline\": {\n", - " \"type\": \"string\",\n", - " \"description\": \"The punchline for the joke\"\n", - " }\n", - " },\n", - " \"required\": [\"setup\", \"punchline\"]\n", - " }\n", - " }\n", - " ]\n", - "chain = prompt | model.bind(function_call= {\"name\": \"joke\"}, functions= functions)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "decf7710", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='', additional_kwargs={'function_call': {'name': 'joke', 'arguments': '{\\n \"setup\": \"Why don\\'t bears wear shoes?\",\\n \"punchline\": \"Because they have bear feet!\"\\n}'}}, example=False)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"}, config={})" - ] - }, - { - "cell_type": "markdown", - "id": "9098c5ed", - "metadata": {}, - "source": [ - "## PromptTemplate + LLM + OutputParser\n", - "\n", - "We can also add in an output parser to easily trasform the raw LLM/ChatModel output into a more workable format" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f799664d", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.output_parser import StrOutputParser" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "cc194c78", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model | StrOutputParser()" - ] - }, - { - "cell_type": "markdown", - "id": "77acf448", - "metadata": {}, - "source": [ - "Notice that this now returns a string - a much more workable format for downstream tasks" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "e3d69a18", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\"" 
- ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"})" - ] - }, - { - "cell_type": "markdown", - "id": "c01864e5", - "metadata": {}, - "source": [ - "### Functions Output Parser\n", - "\n", - "When you specify the function to return, you may just want to parse that directly" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "ad0dd88e", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n", - "chain = (\n", - " prompt \n", - " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", - " | JsonOutputFunctionsParser()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "1e7aa8eb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'setup': \"Why don't bears wear shoes?\",\n", - " 'punchline': 'Because they have bear feet!'}" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "d4aa1a01", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser\n", - "chain = (\n", - " prompt \n", - " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", - " | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "8b6df9ba", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"Why don't bears like fast food?\"" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bears\"})" - ] - }, - { - "cell_type": "markdown", - "id": "2ed58136", - "metadata": {}, - "source": [ - "## Passthroughs and itemgetter\n", - "\n", - "Often times when constructing a chain you may want to pass along original input variables to future steps in the chain. How exactly you do this depends on what exactly the input is:\n", - "\n", - "- If the original input was a string, then you likely just want to pass along the string. This can be done with `RunnablePassthrough`. For an example of this, see `LLMChain + Retriever`\n", - "- If the original input was a dictionary, then you likely want to pass along specific keys. This can be done with `itemgetter`. 
For an example of this see `Multiple LLM Chains`" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "5d3d8ffe", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnablePassthrough\n", - "from operator import itemgetter" - ] - }, - { - "cell_type": "markdown", - "id": "91c5ef3d", - "metadata": {}, - "source": [ - "## LLMChain + Retriever\n", - "\n", - "Let's now look at adding in a retrieval step, which adds up to a \"retrieval-augmented generation\" chain" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "33be32af", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.vectorstores import Chroma\n", - "from langchain.embeddings import OpenAIEmbeddings\n", - "from langchain.schema.runnable import RunnablePassthrough" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "df3f3fa2", - "metadata": {}, - "outputs": [], - "source": [ - "# Create the retriever\n", - "vectorstore = Chroma.from_texts([\"harrison worked at kensho\"], embedding=OpenAIEmbeddings())\n", - "retriever = vectorstore.as_retriever()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "bfc47ec1", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Answer the question based only on the following context:\n", - "{context}\n", - "\n", - "Question: {question}\n", - "\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "eae31755", - "metadata": {}, - "outputs": [], - "source": [ - "chain = (\n", - " {\"context\": retriever, \"question\": RunnablePassthrough()} \n", - " | prompt \n", - " | model \n", - " | StrOutputParser()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "f3040b0c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "'Harrison worked at Kensho.'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke(\"where did harrison work?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "e1d20c7c", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Answer the question based only on the following context:\n", - "{context}\n", - "\n", - "Question: {question}\n", - "\n", - "Answer in the following language: {language}\n", - "\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)\n", - "\n", - "chain = {\n", - " \"context\": itemgetter(\"question\") | retriever, \n", - " \"question\": itemgetter(\"question\"), \n", - " \"language\": itemgetter(\"language\")\n", - "} | prompt | model | StrOutputParser()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "7ee8b2d4", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "'Harrison ha lavorato a Kensho.'" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"question\": \"where did harrison work\", \"language\": \"italian\"})" - ] - }, - { - "cell_type": "markdown", - "id": "f007669c", - "metadata": {}, - "source": [ - "## Conversational 
Retrieval Chain\n", - "\n", - "We can easily add in conversation history. This primarily means adding in chat_message_history" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "3f30c348", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnableMap\n", - "from langchain.schema import format_document" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "64ab1dbf", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.prompts.prompt import PromptTemplate\n", - "\n", - "_template = \"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n", - "\n", - "Chat History:\n", - "{chat_history}\n", - "Follow Up Input: {question}\n", - "Standalone question:\"\"\"\n", - "CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "7d628c97", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Answer the question based only on the following context:\n", - "{context}\n", - "\n", - "Question: {question}\n", - "\"\"\"\n", - "ANSWER_PROMPT = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f60a5d0f", - "metadata": {}, - "outputs": [], - "source": [ - "DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template=\"{page_content}\")\n", - "def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator=\"\\n\\n\"):\n", - " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", - " return document_separator.join(doc_strings)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "7d007db6", - "metadata": {}, - "outputs": [], - "source": [ - "from typing import Tuple, List\n", - "def _format_chat_history(chat_history: List[Tuple]) -> str:\n", - " buffer = \"\"\n", - " for dialogue_turn in chat_history:\n", - " human = \"Human: \" + dialogue_turn[0]\n", - " ai = \"Assistant: \" + dialogue_turn[1]\n", - " buffer += \"\\n\" + \"\\n\".join([human, ai])\n", - " return buffer" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "5c32cc89", - "metadata": {}, - "outputs": [], - "source": [ - "_inputs = RunnableMap(\n", - " {\n", - " \"standalone_question\": {\n", - " \"question\": lambda x: x[\"question\"],\n", - " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", - " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", - " }\n", - ")\n", - "_context = {\n", - " \"context\": itemgetter(\"standalone_question\") | retriever | _combine_documents,\n", - " \"question\": lambda x: x[\"standalone_question\"]\n", - "}\n", - "conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "135c8205", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conversational_qa_chain.invoke({\n", - " \"question\": \"where did harrison work?\",\n", - " \"chat_history\": [],\n", - "})" - ] - }, - { - 
"cell_type": "code", - "execution_count": 15, - "id": "424e7e7a", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "AIMessage(content='Harrison worked at Kensho.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "conversational_qa_chain.invoke({\n", - " \"question\": \"where did he work?\",\n", - " \"chat_history\": [(\"Who wrote this notebook?\", \"Harrison\")],\n", - "})" - ] - }, - { - "cell_type": "markdown", - "id": "c5543183", - "metadata": {}, - "source": [ - "### With Memory and returning source documents\n", - "\n", - "This shows how to use memory with the above. For memory, we need to manage that outside at the memory. For returning the retrieved documents, we just need to pass them through all the way." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "e31dd17c", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.memory import ConversationBufferMemory" - ] - }, - { - "cell_type": "code", - "execution_count": 44, - "id": "d4bffe94", - "metadata": {}, - "outputs": [], - "source": [ - "memory = ConversationBufferMemory(return_messages=True, output_key=\"answer\", input_key=\"question\")" - ] - }, - { - "cell_type": "code", - "execution_count": 45, - "id": "733be985", - "metadata": {}, - "outputs": [], - "source": [ - "# First we add a step to load memory\n", - "# This needs to be a RunnableMap because its the first input\n", - "loaded_memory = RunnableMap(\n", - " {\n", - " \"question\": itemgetter(\"question\"),\n", - " \"memory\": memory.load_memory_variables,\n", - " }\n", - ")\n", - "# Next we add a step to expand memory into the variables\n", - "expanded_memory = {\n", - " \"question\": itemgetter(\"question\"),\n", - " \"chat_history\": lambda x: x[\"memory\"][\"history\"]\n", - "}\n", - "\n", - "# Now we calculate the standalone question\n", - "standalone_question = {\n", - " \"standalone_question\": {\n", - " \"question\": lambda x: x[\"question\"],\n", - " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", - " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", - "}\n", - "# Now we retrieve the documents\n", - "retrieved_documents = {\n", - " \"docs\": itemgetter(\"standalone_question\") | retriever,\n", - " \"question\": lambda x: x[\"standalone_question\"]\n", - "}\n", - "# Now we construct the inputs for the final prompt\n", - "final_inputs = {\n", - " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", - " \"question\": itemgetter(\"question\")\n", - "}\n", - "# And finally, we do the part that returns the answers\n", - "answer = {\n", - " \"answer\": final_inputs | ANSWER_PROMPT | ChatOpenAI(),\n", - " \"docs\": itemgetter(\"docs\"),\n", - "}\n", - "# And now we put it all together!\n", - "final_chain = loaded_memory | expanded_memory | standalone_question | retrieved_documents | answer" - ] - }, - { - "cell_type": "code", - "execution_count": 46, - "id": "806e390c", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Number of requested results 4 is greater than number of elements in index 1, updating n_results = 1\n" - ] - }, - { - "data": { - "text/plain": [ - "{'answer': AIMessage(content='Harrison was employed at Kensho.', 
additional_kwargs={}, example=False),\n", - " 'docs': [Document(page_content='harrison worked at kensho', metadata={})]}" - ] - }, - "execution_count": 46, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = {\"question\": \"where did harrison work?\"}\n", - "result = final_chain.invoke(inputs)\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "977399fd", - "metadata": {}, - "outputs": [], - "source": [ - "# Note that the memory does not save automatically\n", - "# This will be improved in the future\n", - "# For now you need to save it yourself\n", - "memory.save_context(inputs, {\"answer\": result[\"answer\"].content})" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "f94f7de4", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'history': [HumanMessage(content='where did harrison work?', additional_kwargs={}, example=False),\n", - " AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)]}" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "memory.load_memory_variables({})" - ] - }, - { - "cell_type": "markdown", - "id": "0f2bf8d3", - "metadata": {}, - "source": [ - "## Multiple LLM Chains\n", - "\n", - "This can also be used to string together multiple LLMChains" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "d65d4e9e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'El país en el que nació la ciudad de Honolulu, Hawái, donde nació Barack Obama, el 44º presidente de los Estados Unidos, es Estados Unidos.'" - ] - }, - "execution_count": 31, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from operator import itemgetter\n", - "\n", - "prompt1 = ChatPromptTemplate.from_template(\"what is the city {person} is from?\")\n", - "prompt2 = ChatPromptTemplate.from_template(\"what country is the city {city} in? respond in {language}\")\n", - "\n", - "chain1 = prompt1 | model | StrOutputParser()\n", - "\n", - "chain2 = {\"city\": chain1, \"language\": itemgetter(\"language\")} | prompt2 | model | StrOutputParser()\n", - "\n", - "chain2.invoke({\"person\": \"obama\", \"language\": \"spanish\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "878f8176", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnableMap\n", - "prompt1 = ChatPromptTemplate.from_template(\"generate a random color\")\n", - "prompt2 = ChatPromptTemplate.from_template(\"what is a fruit of color: {color}\")\n", - "prompt3 = ChatPromptTemplate.from_template(\"what is countries flag that has the color: {color}\")\n", - "prompt4 = ChatPromptTemplate.from_template(\"What is the color of {fruit} and {country}\")\n", - "chain1 = prompt1 | model | StrOutputParser()\n", - "chain2 = RunnableMap(steps={\"color\": chain1}) | {\n", - " \"fruit\": prompt2 | model | StrOutputParser(),\n", - " \"country\": prompt3 | model | StrOutputParser(),\n", - "} | prompt4" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "d621a870", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "ChatPromptValue(messages=[HumanMessage(content=\"What is the color of A fruit that has a color similar to #7E7DE6 is the Peruvian Apple Cactus (Cereus repandus). It is a tropical fruit with a vibrant purple or violet exterior. 
and The country's flag that has the color #7E7DE6 is North Macedonia.\", additional_kwargs={}, example=False)])" - ] - }, - "execution_count": 33, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain2.invoke({})" - ] - }, - { - "cell_type": "markdown", - "id": "d094d637", - "metadata": {}, - "source": [ - "## Router\n", - "\n", - "You can also use the router runnable to conditionally route inputs to different runnables." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "252625fd", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains import create_tagging_chain_pydantic\n", - "from pydantic import BaseModel, Field\n", - "\n", - "class PromptToUse(BaseModel):\n", - " \"\"\"Used to determine which prompt to use to answer the user's input.\"\"\"\n", - " \n", - " name: str = Field(description=\"Should be one of `math` or `english`\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "57886e84", - "metadata": {}, - "outputs": [], - "source": [ - "tagger = create_tagging_chain_pydantic(PromptToUse, ChatOpenAI(temperature=0))" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "a303b089", - "metadata": {}, - "outputs": [], - "source": [ - "chain1 = ChatPromptTemplate.from_template(\"You are a math genius. Answer the question: {question}\") | ChatOpenAI()\n", - "chain2 = ChatPromptTemplate.from_template(\"You are an english major. Answer the question: {question}\") | ChatOpenAI()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "7aa9ea06", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RouterRunnable\n", - "router = RouterRunnable({\"math\": chain1, \"english\": chain2})" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "6a3d3f5d", - "metadata": {}, - "outputs": [], - "source": [ - "chain = {\n", - " \"key\": {\"input\": lambda x: x[\"question\"]} | tagger | (lambda x: x['text'].name),\n", - " \"input\": {\"question\": lambda x: x[\"question\"]}\n", - "} | router" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "8aeda930", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='Thank you for the compliment! The sum of 2 + 2 is equal to 4.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"question\": \"whats 2 + 2\"})" - ] - }, - { - "cell_type": "markdown", - "id": "29781123", - "metadata": {}, - "source": [ - "## Tools\n", - "\n", - "You can use any LangChain tool easily" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "9232d2a9", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/harrisonchase/.pyenv/versions/3.9.1/envs/langchain/lib/python3.9/site-packages/deeplake/util/check_latest_version.py:32: UserWarning: A newer version of deeplake (3.6.14) is available. 
It's recommended that you update to the latest version using `pip install -U deeplake`.\n", - " warnings.warn(\n" - ] - } - ], - "source": [ - "from langchain.tools import DuckDuckGoSearchRun" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a0c64d2c", - "metadata": {}, - "outputs": [], - "source": [ - "search = DuckDuckGoSearchRun()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "391969b6", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"turn the following user input into a search query for a search engine:\n", - "\n", - "{input}\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "e3d9d20d", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model | StrOutputParser() | search" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "55f2967d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "\"What sports games are on TV today & tonight? Watch and stream live sports on TV today, tonight, tomorrow. Today's 2023 sports TV schedule includes football, basketball, baseball, hockey, motorsports, soccer and more. Watch on TV or stream online on ESPN, FOX, FS1, CBS, NBC, ABC, Peacock, Paramount+, fuboTV, local channels and many other networks. Weather Alerts Alerts Bar. Not all offers available in all states, please visit BetMGM for the latest promotions for your area. Must be 21+ to gamble, please wager responsibly. If you or someone ... Speak of the Devils. Good Morning Arizona. Happy Hour Spots. Jaime's Local Love. Surprise Squad. Silver Apple. Field Trip Friday. Seen on TV. Arizona Highways TV. MLB Games Tonight: How to Watch on TV, Streaming & Odds - Friday, July 28. San Diego Padres' Juan Soto plays during the first baseball game in a doubleheader, Saturday, July 15, 2023, in Philadelphia. (AP Photo/Matt Slocum) (APMedia) Today's MLB schedule features top teams in action. Among those games is the Texas Rangers playing the San Diego ... TV. Cleveland at Chi. White Sox. 1:10pm. Bally Sports. NBCS-CHI. Cleveland Guardians (50-51) are second place in AL Central and Chicago White Sox (41-61) are fourth place in AL Central. The Guardians are 23-27 on the road this season and White Sox are 21-26 at home. Chi. Cubs at St. Louis.\"" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"input\": \"I'd like to figure out what games are tonight\"})" - ] - }, - { - "cell_type": "markdown", - "id": "fbc4bf6e", - "metadata": {}, - "source": [ - "## Arbitrary Functions\n", - "\n", - "You can use arbitrary functions in the pipeline\n", - "\n", - "Note that all inputs to these functions need to be a SINGLE argument. If you have a function that accepts multiple arguments, you should write a wrapper that accepts a single input and unpacks it into multiple argument." 
- ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "6bb221b3", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.schema.runnable import RunnableLambda\n", - "\n", - "def length_function(text):\n", - " return len(text)\n", - "\n", - "def _multiple_length_function(text1, text2):\n", - " return len(text1) * len(text2)\n", - "\n", - "def multiple_length_function(_dict):\n", - " return _multiple_length_function(_dict[\"text1\"], _dict[\"text2\"])\n", - "\n", - "prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n", - "\n", - "chain1 = prompt | model\n", - "\n", - "chain = {\n", - " \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n", - " \"b\": {\"text1\": itemgetter(\"foo\"), \"text2\": itemgetter(\"bar\")} | RunnableLambda(multiple_length_function)\n", - "} | prompt | model" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "5488ec85", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='3 + 9 is equal to 12.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 36, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"foo\": \"bar\", \"bar\": \"gah\"})" - ] - }, - { - "cell_type": "markdown", - "id": "506e9636", - "metadata": {}, - "source": [ - "## SQL Database\n", - "\n", - "We can also try to replicate our SQLDatabaseChain using this style." - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "7a927516", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Based on the table schema below, write a SQL query that would answer the user's question:\n", - "{schema}\n", - "\n", - "Question: {question}\"\"\"\n", - "prompt = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "3f51f386", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.utilities import SQLDatabase" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "2ccca6fc", - "metadata": {}, - "outputs": [], - "source": [ - "db = SQLDatabase.from_uri(\"sqlite:///../../../../notebooks/Chinook.db\")" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "05ba88ee", - "metadata": {}, - "outputs": [], - "source": [ - "def get_schema(_):\n", - " return db.get_table_info()" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "a4eda902", - "metadata": {}, - "outputs": [], - "source": [ - "def run_query(query):\n", - " return db.run(query)" - ] - }, - { - "cell_type": "code", - "execution_count": 47, - "id": "5046cb17", - "metadata": {}, - "outputs": [], - "source": [ - "inputs = {\n", - " \"schema\": RunnableLambda(get_schema),\n", - " \"question\": itemgetter(\"question\")\n", - "}\n", - "sql_response = (\n", - " RunnableMap(inputs)\n", - " | prompt\n", - " | model.bind(stop=[\"\\nSQLResult:\"])\n", - " | StrOutputParser()\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": 48, - "id": "a5552039", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'SELECT COUNT(*) \\nFROM Employee;'" - ] - }, - "execution_count": 48, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sql_response.invoke({\"question\": \"How many employees are there?\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "d6fee130", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Based on the table schema below, question, sql query, and sql response, 
write a natural language response:\n", - "{schema}\n", - "\n", - "Question: {question}\n", - "SQL Query: {query}\n", - "SQL Response: {response}\"\"\"\n", - "prompt_response = ChatPromptTemplate.from_template(template)" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "923aa634", - "metadata": {}, - "outputs": [], - "source": [ - "full_chain = (\n", - " RunnableMap({\n", - " \"question\": itemgetter(\"question\"),\n", - " \"query\": sql_response,\n", - " }) \n", - " | {\n", - " \"schema\": RunnableLambda(get_schema),\n", - " \"question\": itemgetter(\"question\"),\n", - " \"query\": itemgetter(\"query\"),\n", - " \"response\": lambda x: db.run(x[\"query\"]) \n", - " } \n", - " | prompt_response \n", - " | model\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "id": "e94963d8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='There are 8 employees.', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "full_chain.invoke({\"question\": \"How many employees are there?\"})" - ] - }, - { - "cell_type": "markdown", - "id": "f09fd305", - "metadata": {}, - "source": [ - "## Code Writing" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "bd7c259a", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.utilities import PythonREPL\n", - "from langchain.prompts import SystemMessagePromptTemplate, HumanMessagePromptTemplate" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "id": "73795d2d", - "metadata": {}, - "outputs": [], - "source": [ - "template = \"\"\"Write some python code to solve the user's problem. \n", - "\n", - "Return only python code in Markdown format, e.g.:\n", - "\n", - "```python\n", - "....\n", - "```\"\"\"\n", - "prompt = ChatPromptTemplate(messages=[\n", - " SystemMessagePromptTemplate.from_template(template),\n", - " HumanMessagePromptTemplate.from_template(\"{input}\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "id": "42859e8a", - "metadata": {}, - "outputs": [], - "source": [ - "def _sanitize_output(text: str):\n", - " _, after = text.split(\"```python\")\n", - " return after.split(\"```\")[0]" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "5ded1a86", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run" - ] - }, - { - "cell_type": "code", - "execution_count": 68, - "id": "208c2b75", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Python REPL can execute arbitrary code. Use with caution.\n" - ] - }, - { - "data": { - "text/plain": [ - "'4\\n'" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"input\": \"whats 2 plus 2\"})" - ] - }, - { - "cell_type": "markdown", - "id": "5062941a", - "metadata": {}, - "source": [ - "## Memory\n", - "\n", - "This shows how to add memory to an arbitrary chain. 
Right now, you can use the memory classes but need to hook it up manually" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "id": "7998efd8", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.memory import ConversationBufferMemory\n", - "from langchain.schema.runnable import RunnableMap\n", - "from langchain.prompts import MessagesPlaceholder\n", - "model = ChatOpenAI()\n", - "prompt = ChatPromptTemplate.from_messages([\n", - " (\"system\", \"You are a helpful chatbot\"),\n", - " MessagesPlaceholder(variable_name=\"history\"),\n", - " (\"human\", \"{input}\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "id": "fa0087f3", - "metadata": {}, - "outputs": [], - "source": [ - "memory = ConversationBufferMemory(return_messages=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "id": "06b531ae", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'history': []}" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "memory.load_memory_variables({})" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "id": "d9437af6", - "metadata": {}, - "outputs": [], - "source": [ - "chain = RunnableMap({\n", - " \"input\": lambda x: x[\"input\"],\n", - " \"memory\": memory.load_memory_variables\n", - "}) | {\n", - " \"input\": lambda x: x[\"input\"],\n", - " \"history\": lambda x: x[\"memory\"][\"history\"]\n", - "} | prompt | model" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "id": "bed1e260", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = {\"input\": \"hi im bob\"}\n", - "response = chain.invoke(inputs)\n", - "response" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "id": "890475b4", - "metadata": {}, - "outputs": [], - "source": [ - "memory.save_context(inputs, {\"output\": response.content})" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "id": "e8fcb77f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'history': [HumanMessage(content='hi im bob', additional_kwargs={}, example=False),\n", - " AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)]}" - ] - }, - "execution_count": 105, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "memory.load_memory_variables({})" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "id": "d837d5c3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "AIMessage(content='Your name is Bob. You mentioned it in your previous message. Is there anything else I can help you with, Bob?', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "inputs = {\"input\": \"whats my name\"}\n", - "response = chain.invoke(inputs)\n", - "response" - ] - }, - { - "cell_type": "markdown", - "id": "4927a727-b4c8-453c-8c83-bd87b4fcac14", - "metadata": {}, - "source": [ - "## Moderation\n", - "\n", - "This shows how to add in moderation (or other safeguards) around your LLM application." 
- ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "4f5f6449-940a-4f5c-97c0-39b71c3e2a68", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.chains import OpenAIModerationChain\n", - "from langchain.llms import OpenAI" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "fcb8312b-7e7a-424f-a3ec-76738c9a9d21", - "metadata": {}, - "outputs": [], - "source": [ - "moderate = OpenAIModerationChain()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "b24b9148-f6b0-4091-8ea8-d3fb281bd950", - "metadata": {}, - "outputs": [], - "source": [ - "model = OpenAI()\n", - "prompt = ChatPromptTemplate.from_messages([\n", - " (\"system\", \"repeat after me: {input}\")\n", - "])" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": "1c8ed87c-9ca6-4559-bf60-d40e94a0af08", - "metadata": {}, - "outputs": [], - "source": [ - "chain = prompt | model" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "5256b9bd-381a-42b0-bfa8-7e6d18f853cb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\n\\nYou are stupid.'" - ] - }, - "execution_count": 34, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "chain.invoke({\"input\": \"you are stupid\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "fe6e3b33-dc9a-49d5-b194-ba750c58a628", - "metadata": {}, - "outputs": [], - "source": [ - "moderated_chain = chain | moderate" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "id": "d8ba0cbd-c739-4d23-be9f-6ae092bd5ffb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'input': '\\n\\nYou are stupid.',\n", - " 'output': \"Text was found that violates OpenAI's content policy.\"}" - ] - }, - "execution_count": 37, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "moderated_chain.invoke({\"input\": \"you are stupid\"})" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/docs/extras/expression_language/cookbook/agent.ipynb b/docs/extras/expression_language/cookbook/agent.ipynb new file mode 100644 index 0000000000..5be6b9d4d1 --- /dev/null +++ b/docs/extras/expression_language/cookbook/agent.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e89f490d", + "metadata": {}, + "source": [ + "# Agents\n", + "\n", + "You can pass a Runnable into an agent." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "af4381de", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import XMLAgent, tool, AgentExecutor\n", + "from langchain.chat_models import ChatAnthropic" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "24cc8134", + "metadata": {}, + "outputs": [], + "source": [ + "model = ChatAnthropic(model=\"claude-2\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "67c0b0e4", + "metadata": {}, + "outputs": [], + "source": [ + "@tool\n", + "def search(query: str) -> str:\n", + " \"\"\"Search things about current events.\"\"\"\n", + " return \"32 degrees\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "7203b101", + "metadata": {}, + "outputs": [], + "source": [ + "tool_list = [search]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "b68e756d", + "metadata": {}, + "outputs": [], + "source": [ + "# Get prompt to use\n", + "prompt = XMLAgent.get_default_prompt()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "61ab3e9a", + "metadata": {}, + "outputs": [], + "source": [ + "# Logic for going from intermediate steps to a string to pass into model\n", + "# This is pretty tied to the prompt\n", + "def convert_intermediate_steps(intermediate_steps):\n", + " log = \"\"\n", + " for action, observation in intermediate_steps:\n", + " log += (\n", + " f\"<tool>{action.tool}</tool><tool_input>{action.tool_input}\"\n", + " f\"</tool_input><observation>{observation}</observation>\"\n", + " )\n", + " return log\n", + "\n", + "\n", + "# Logic for converting tools to string to go in prompt\n", + "def convert_tools(tools):\n", + " return \"\\n\".join([f\"{tool.name}: {tool.description}\" for tool in tools])" + ] + }, + { + "cell_type": "markdown", + "id": "260f5988", + "metadata": {}, + "source": [ + "Building an agent from a runnable usually involves a few things:\n", + "\n", + "1. Data processing for the intermediate steps. These need to be represented in a way that the language model can recognize them. This should be pretty tightly coupled to the instructions in the prompt.\n", + "\n", + "2. The prompt itself\n", + "\n", + "3. The model, complete with stop tokens if needed\n", + "\n", + "4. The output parser - should be in sync with how the prompt specifies things to be formatted.",
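+ "\n", + "For example, a rough sketch of point 1: the `convert_intermediate_steps` helper above renders each `(action, observation)` pair into the XML format the prompt expects (the values below are illustrative):\n", + "\n", + "```python\n", + "from langchain.schema import AgentAction\n", + "\n", + "steps = [(AgentAction(tool=\"search\", tool_input=\"weather in new york\", log=\"\"), \"32 degrees\")]\n", + "convert_intermediate_steps(steps)\n", + "# -> '<tool>search</tool><tool_input>weather in new york</tool_input><observation>32 degrees</observation>'\n", + "```"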
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e92f1d6f", + "metadata": {}, + "outputs": [], + "source": [ + "agent = (\n", + " {\n", + " \"question\": lambda x: x[\"question\"],\n", + " \"intermediate_steps\": lambda x: convert_intermediate_steps(x[\"intermediate_steps\"])\n", + " }\n", + " | prompt.partial(tools=convert_tools(tool_list))\n", + " | model.bind(stop=[\"</tool_input>\", \"</final_answer>\"])\n", + " | XMLAgent.get_default_output_parser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "6ce6ec7a", + "metadata": {}, + "outputs": [], + "source": [ + "agent_executor = AgentExecutor(agent=agent, tools=tool_list, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "fb5cb2e3", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m <tool>search</tool>\n", + "<tool_input>weather in new york\u001b[0m\u001b[36;1m\u001b[1;3m32 degrees\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "\n", + "<final_answer>The weather in New York is 32 degrees\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "{'question': 'whats the weather in New york?',\n", + " 'output': 'The weather in New York is 32 degrees'}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent_executor.invoke({\"question\": \"whats the weather in New york?\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bce86dd8", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/code_writing.ipynb b/docs/extras/expression_language/cookbook/code_writing.ipynb new file mode 100644 index 0000000000..25b039ce44 --- /dev/null +++ b/docs/extras/expression_language/cookbook/code_writing.ipynb @@ -0,0 +1,119 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "f09fd305", + "metadata": {}, + "source": [ + "# Code writing\n", + "\n", + "Example of how to use LCEL to write Python code." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "bd7c259a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.utilities import PythonREPL" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "73795d2d", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Write some python code to solve the user's problem. 
\n", + "\n", + "Return only python code in Markdown format, e.g.:\n", + "\n", + "```python\n", + "....\n", + "```\"\"\"\n", + "prompt = ChatPromptTemplate.from_messages(\n", + " [(\"system\", template), (\"human\", \"{input}\")]\n", + ")\n", + "\n", + "model = ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "42859e8a", + "metadata": {}, + "outputs": [], + "source": [ + "def _sanitize_output(text: str):\n", + " _, after = text.split(\"```python\")\n", + " return after.split(\"```\")[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "5ded1a86", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model | StrOutputParser() | _sanitize_output | PythonREPL().run" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "208c2b75", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Python REPL can execute arbitrary code. Use with caution.\n" + ] + }, + { + "data": { + "text/plain": [ + "'4\\n'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"whats 2 plus 2\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/index.mdx b/docs/extras/expression_language/cookbook/index.mdx new file mode 100644 index 0000000000..6310fd50b9 --- /dev/null +++ b/docs/extras/expression_language/cookbook/index.mdx @@ -0,0 +1,11 @@ +--- +sidebar_position: 2 +--- + +# Cookbook + +import DocCardList from "@theme/DocCardList"; + +Example code for accomplishing common tasks with the LangChain Expression Language (LCEL). These examples show how to compose different Runnable (the core LCEL interface) components to achieve various tasks. If you're just getting acquainted with LCEL, the [Prompt + LLM](/docs/expression_language/cookbook/prompt_llm_parser) page is a good place to start. + + \ No newline at end of file diff --git a/docs/extras/expression_language/cookbook/memory.ipynb b/docs/extras/expression_language/cookbook/memory.ipynb new file mode 100644 index 0000000000..bef7e5ed01 --- /dev/null +++ b/docs/extras/expression_language/cookbook/memory.ipynb @@ -0,0 +1,180 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5062941a", + "metadata": {}, + "source": [ + "# Adding memory\n", + "\n", + "This shows how to add memory to an arbitrary chain. 
Right now, you can use the memory classes, but you need to hook them up manually." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7998efd8", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.schema.runnable import RunnableMap\n", + "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n", + "\n", + "model = ChatOpenAI()\n", + "prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"You are a helpful chatbot\"),\n", + " MessagesPlaceholder(variable_name=\"history\"),\n", + " (\"human\", \"{input}\")\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "fa0087f3", + "metadata": {}, + "outputs": [], + "source": [ + "memory = ConversationBufferMemory(return_messages=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "06b531ae", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'history': []}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memory.load_memory_variables({})" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d9437af6", + "metadata": {}, + "outputs": [], + "source": [ + "chain = RunnableMap({\n", + " \"input\": lambda x: x[\"input\"],\n", + " \"memory\": memory.load_memory_variables\n", + "}) | {\n", + " \"input\": lambda x: x[\"input\"],\n", + " \"history\": lambda x: x[\"memory\"][\"history\"]\n", + "} | prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bed1e260", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Hello Bob! How can I assist you today?', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = {\"input\": \"hi im bob\"}\n", + "response = chain.invoke(inputs)\n", + "response" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "890475b4", + "metadata": {}, + "outputs": [], + "source": [ + "memory.save_context(inputs, {\"output\": response.content})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "e8fcb77f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'history': [HumanMessage(content='hi im bob', additional_kwargs={}, example=False),\n", + " AIMessage(content='Hello Bob! 
How can I assist you today?', additional_kwargs={}, example=False)]}" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memory.load_memory_variables({})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "d837d5c3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Your name is Bob.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = {\"input\": \"whats my name\"}\n", + "response = chain.invoke(inputs)\n", + "response" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/moderation.ipynb b/docs/extras/expression_language/cookbook/moderation.ipynb new file mode 100644 index 0000000000..cb4114d8e9 --- /dev/null +++ b/docs/extras/expression_language/cookbook/moderation.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "4927a727-b4c8-453c-8c83-bd87b4fcac14", + "metadata": {}, + "source": [ + "# Adding moderation\n", + "\n", + "This shows how to add in moderation (or other safeguards) around your LLM application." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "4f5f6449-940a-4f5c-97c0-39b71c3e2a68", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import OpenAIModerationChain\n", + "from langchain.llms import OpenAI\n", + "from langchain.prompts import ChatPromptTemplate" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fcb8312b-7e7a-424f-a3ec-76738c9a9d21", + "metadata": {}, + "outputs": [], + "source": [ + "moderate = OpenAIModerationChain()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "b24b9148-f6b0-4091-8ea8-d3fb281bd950", + "metadata": {}, + "outputs": [], + "source": [ + "model = OpenAI()\n", + "prompt = ChatPromptTemplate.from_messages([\n", + " (\"system\", \"repeat after me: {input}\")\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "1c8ed87c-9ca6-4559-bf60-d40e94a0af08", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5256b9bd-381a-42b0-bfa8-7e6d18f853cb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\n\\nYou are stupid.'" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"you are stupid\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "fe6e3b33-dc9a-49d5-b194-ba750c58a628", + "metadata": {}, + "outputs": [], + "source": [ + "moderated_chain = chain | moderate" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d8ba0cbd-c739-4d23-be9f-6ae092bd5ffb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'input': '\\n\\nYou are stupid',\n", + " 'output': \"Text was found that violates OpenAI's content policy.\"}" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "moderated_chain.invoke({\"input\": \"you are stupid\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/multiple_chains.ipynb b/docs/extras/expression_language/cookbook/multiple_chains.ipynb new file mode 100644 index 0000000000..7db06a85f5 --- /dev/null +++ b/docs/extras/expression_language/cookbook/multiple_chains.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "877102d1-02ea-4fa3-8ec7-a08e242b95b3", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 2\n", + "title: Multiple chains\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "0f2bf8d3", + "metadata": {}, + "source": [ + "Runnables can easily be used to string together multiple Chains" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d65d4e9e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'El país donde se encuentra la ciudad de Honolulu, donde nació Barack Obama, el 44º Presidente de los Estados Unidos, es Estados Unidos. Honolulu se encuentra en la isla de Oahu, en el estado de Hawái.'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema import StrOutputParser\n", + "\n", + "prompt1 = ChatPromptTemplate.from_template(\"what is the city {person} is from?\")\n", + "prompt2 = ChatPromptTemplate.from_template(\"what country is the city {city} in? respond in {language}\")\n", + "\n", + "model = ChatOpenAI()\n", + "\n", + "chain1 = prompt1 | model | StrOutputParser()\n", + "\n", + "chain2 = {\"city\": chain1, \"language\": itemgetter(\"language\")} | prompt2 | model | StrOutputParser()\n", + "\n", + "chain2.invoke({\"person\": \"obama\", \"language\": \"spanish\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "878f8176", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableMap, RunnablePassthrough\n", + "\n", + "prompt1 = ChatPromptTemplate.from_template(\"generate a {attribute} color. Return the name of the color and nothing else:\")\n", + "prompt2 = ChatPromptTemplate.from_template(\"what is a fruit of color: {color}. Return the name of the fruit and nothing else:\")\n", + "prompt3 = ChatPromptTemplate.from_template(\"what is a country with a flag that has the color: {color}. 
Return the name of the country and nothing else:\")\n", + "prompt4 = ChatPromptTemplate.from_template(\"What is the color of {fruit} and the flag of {country}?\")\n", + "\n", + "model_parser = model | StrOutputParser()\n", + "\n", + "color_generator = {\"attribute\": RunnablePassthrough()} | prompt1 | {\"color\": model_parser}\n", + "color_to_fruit = prompt2 | model_parser\n", + "color_to_country = prompt3 | model_parser\n", + "question_generator = color_generator | {\"fruit\": color_to_fruit, \"country\": color_to_country} | prompt4" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d621a870", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "ChatPromptValue(messages=[HumanMessage(content='What is the color of strawberry and the flag of China?', additional_kwargs={}, example=False)])" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "question_generator.invoke(\"warm\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b4a9812b-bead-4fd9-ae27-0b8be57e5dc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='The color of an apple is typically red or green. The flag of China is predominantly red with a large yellow star in the upper left corner and four smaller yellow stars surrounding it.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "prompt = question_generator.invoke(\"warm\")\n", + "model.invoke(prompt)" + ] + }, + { + "cell_type": "markdown", + "id": "6d75a313-f1c8-4e94-9a17-24e0bf4a2bdc", + "metadata": {}, + "source": [ + "### Branching and Merging\n", + "\n", + "You may want the output of one component to be processed by two or more other components. [RunnableMaps](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.base.RunnableMap.html) let you split or fork the chain so multiple components can process the input in parallel. Later, other components can join or merge the results to synthesize a final response. 
This type of chain creates a computation graph that looks like the following:\n", + "\n", + "```text\n", + " Input\n", + " / \\\n", + " / \\\n", + " Branch1 Branch2\n", + " \\ /\n", + " \\ /\n", + " Combine\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "247fa0bd-4596-4063-8cb3-1d7fc119d982", + "metadata": {}, + "outputs": [], + "source": [ + "planner = (\n", + " ChatPromptTemplate.from_template(\n", + " \"Generate an argument about: {input}\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + " | {\"base_response\": RunnablePassthrough()}\n", + ")\n", + "\n", + "arguments_for = (\n", + " ChatPromptTemplate.from_template(\n", + " \"List the pros or positive aspects of {base_response}\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + ")\n", + "arguments_against = (\n", + " ChatPromptTemplate.from_template(\n", + " \"List the cons or negative aspects of {base_response}\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "final_responder = (\n", + " ChatPromptTemplate.from_messages(\n", + " [\n", + " (\"ai\", \"{original_response}\"),\n", + " (\"human\", \"Pros:\\n{results_1}\\n\\nCons:\\n{results_2}\"),\n", + " (\"system\", \"Generate a final response given the critique\"),\n", + " ]\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + ")\n", + "\n", + "chain = (\n", + " planner \n", + " | {\n", + " \"results_1\": arguments_for,\n", + " \"results_2\": arguments_against,\n", + " \"original_response\": itemgetter(\"base_response\"),\n", + " }\n", + " | final_responder\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2564f310-0674-4bb1-9c4e-d7848ca73511", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'While Scrum has its potential cons and challenges, many organizations have successfully embraced and implemented this project management framework to great effect. The cons mentioned above can be mitigated or overcome with proper training, support, and a commitment to continuous improvement. It is also important to note that not all cons may be applicable to every organization or project.\\n\\nFor example, while Scrum may be complex initially, with proper training and guidance, teams can quickly grasp the concepts and practices. The lack of predictability can be mitigated by implementing techniques such as velocity tracking and release planning. The limited documentation can be addressed by maintaining a balance between lightweight documentation and clear communication among team members. The dependency on team collaboration can be improved through effective communication channels and regular team-building activities.\\n\\nScrum can be scaled and adapted to larger projects by using frameworks like Scrum of Scrums or LeSS (Large Scale Scrum). Concerns about speed versus quality can be addressed by incorporating quality assurance practices, such as continuous integration and automated testing, into the Scrum process. Scope creep can be managed by having a well-defined and prioritized product backlog, and a strong product owner can be developed through training and mentorship.\\n\\nResistance to change can be overcome by providing proper education and communication to stakeholders and involving them in the decision-making process. 
Ultimately, the cons of Scrum can be seen as opportunities for growth and improvement, and with the right mindset and support, they can be effectively managed.\\n\\nIn conclusion, while Scrum may have its challenges and potential cons, the benefits and advantages it offers in terms of collaboration, flexibility, adaptability, transparency, and customer satisfaction make it a widely adopted and successful project management framework. With proper implementation and continuous improvement, organizations can leverage Scrum to drive innovation, efficiency, and project success.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"scrum\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "poetry-venv", + "language": "python", + "name": "poetry-venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/prompt_llm_parser.ipynb b/docs/extras/expression_language/cookbook/prompt_llm_parser.ipynb new file mode 100644 index 0000000000..1b670904d5 --- /dev/null +++ b/docs/extras/expression_language/cookbook/prompt_llm_parser.ipynb @@ -0,0 +1,431 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "abf7263d-3a62-4016-b5d5-b157f92f2070", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 0\n", + "title: Prompt + LLM\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "9a434f2b-9405-468c-9dfd-254d456b57a6", + "metadata": {}, + "source": [ + "The most common and valuable composition is taking:\n", + "\n", + "``PromptTemplate`` / ``ChatPromptTemplate`` -> ``LLM`` / ``ChatModel`` -> ``OutputParser``\n", + "\n", + "Almost any other chain you build will use this building block." + ] + }, + { + "cell_type": "markdown", + "id": "93aa2c87", + "metadata": {}, + "source": [ + "## PromptTemplate + LLM\n", + "\n", + "The simplest composition is just combining a prompt and model to create a chain that takes user input, adds it to a prompt, passes it to a model, and returns the raw model output.\n", + "\n", + "Note, you can mix and match PromptTemplate/ChatPromptTemplates and LLMs/ChatModels as you like here." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "466b65b3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "prompt = ChatPromptTemplate.from_template(\"tell me a joke about {foo}\")\n", + "model = ChatOpenAI()\n", + "chain = prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "e3d0a6cd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content=\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\", additional_kwargs={}, example=False)" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "7eb9ef50", + "metadata": {}, + "source": [ + "Oftentimes we want to attach kwargs that'll be passed to each model call. 
Here are a few examples of that:" + ] + }, + { + "cell_type": "markdown", + "id": "0b1d8f88", + "metadata": {}, + "source": [ + "### Attaching Stop Sequences" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "562a06bf", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model.bind(stop=[\"\\n\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "43f5d04c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Why did the bear never wear shoes?', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "f3eaf88a", + "metadata": {}, + "source": [ + "### Attaching Function Call information" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f94b71b2", + "metadata": {}, + "outputs": [], + "source": [ + "functions = [\n", + " {\n", + " \"name\": \"joke\",\n", + " \"description\": \"A joke\",\n", + " \"parameters\": {\n", + " \"type\": \"object\",\n", + " \"properties\": {\n", + " \"setup\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The setup for the joke\"\n", + " },\n", + " \"punchline\": {\n", + " \"type\": \"string\",\n", + " \"description\": \"The punchline for the joke\"\n", + " }\n", + " },\n", + " \"required\": [\"setup\", \"punchline\"]\n", + " }\n", + " }\n", + " ]\n", + "chain = prompt | model.bind(function_call= {\"name\": \"joke\"}, functions= functions)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "decf7710", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='', additional_kwargs={'function_call': {'name': 'joke', 'arguments': '{\\n \"setup\": \"Why don\\'t bears wear shoes?\",\\n \"punchline\": \"Because they have bear feet!\"\\n}'}}, example=False)" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"}, config={})" + ] + }, + { + "cell_type": "markdown", + "id": "9098c5ed", + "metadata": {}, + "source": [ + "## PromptTemplate + LLM + OutputParser\n", + "\n", + "We can also add in an output parser to easily transform the raw LLM/ChatModel output into a more workable format" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "cc194c78", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.output_parser import StrOutputParser\n", + "\n", + "chain = prompt | model | StrOutputParser()" + ] + }, + { + "cell_type": "markdown", + "id": "77acf448", + "metadata": {}, + "source": [ + "Notice that this now returns a string - a much more workable format for downstream tasks." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e3d69a18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears wear shoes?\\n\\nBecause they have bear feet!\"" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "c01864e5", + "metadata": {}, + "source": [ + "### Functions Output Parser\n", + "\n", + "When you specify the function to return, you may just want to parse that directly" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "ad0dd88e", + "metadata": {}, + "outputs": [], + "source": [ + "from 
langchain.output_parsers.openai_functions import JsonOutputFunctionsParser\n", + "\n", + "chain = (\n", + " prompt \n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonOutputFunctionsParser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1e7aa8eb", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'setup': \"Why don't bears like fast food?\",\n", + " 'punchline': \"Because they can't catch it!\"}" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d4aa1a01", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser\n", + "\n", + "chain = (\n", + " prompt \n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8b6df9ba", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears wear shoes?\"" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bears\"})" + ] + }, + { + "cell_type": "markdown", + "id": "023fbccb-ef7d-489e-a9ba-f98e17283d51", + "metadata": {}, + "source": [ + "## Simplifying input\n", + "\n", + "To make invocation even simpler, we can add a `RunnableMap` to take care of creating the prompt input dict for us:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9601c0f0-71f9-4bd4-a672-7bd04084b018", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableMap, RunnablePassthrough\n", + "\n", + "map_ = RunnableMap({\"foo\": RunnablePassthrough()})\n", + "chain = (\n", + " map_ \n", + " | prompt\n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "7ec4f154-fda5-4847-9220-41aa902fdc33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears wear shoes?\"" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"bears\")" + ] + }, + { + "cell_type": "markdown", + "id": "def00bfe-0f83-4805-8c8f-8a53f99fa8ea", + "metadata": {}, + "source": [ + "Since we're composing our map with another Runnable, we can even use some syntactic sugar and just use a dict:" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "7bf3846a-02ee-41a3-ba1b-a708827d4f3a", + "metadata": {}, + "outputs": [], + "source": [ + "chain = (\n", + " {\"foo\": RunnablePassthrough()} \n", + " | prompt\n", + " | model.bind(function_call= {\"name\": \"joke\"}, functions= functions) \n", + " | JsonKeyOutputFunctionsParser(key_name=\"setup\")\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "e566d6a1-538d-4cb5-a210-a63e082e4c74", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "\"Why don't bears like fast food?\"" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"bears\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 
(ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/retrieval.ipynb b/docs/extras/expression_language/cookbook/retrieval.ipynb new file mode 100644 index 0000000000..6579b1c7f3 --- /dev/null +++ b/docs/extras/expression_language/cookbook/retrieval.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "abe47592-909c-4844-bf44-9e55c2fb4bfa", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: RAG\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "91c5ef3d", + "metadata": {}, + "source": [ + "Let's look at adding in a retrieval step to a prompt and LLM, which adds up to a \"retrieval-augmented generation\" chain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7f25d9e9-d192-42e9-af50-5660a4bfb0d9", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install langchain openai faiss-cpu" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "33be32af", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.embeddings import OpenAIEmbeddings\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.schema.runnable import RunnablePassthrough\n", + "from langchain.vectorstores import FAISS" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bfc47ec1", + "metadata": {}, + "outputs": [], + "source": [ + "vectorstore = FAISS.from_texts([\"harrison worked at kensho\"], embedding=OpenAIEmbeddings())\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "template = \"\"\"Answer the question based only on the following context:\n", + "{context}\n", + "\n", + "Question: {question}\n", + "\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "model = ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "eae31755", + "metadata": {}, + "outputs": [], + "source": [ + "chain = (\n", + " {\"context\": retriever, \"question\": RunnablePassthrough()} \n", + " | prompt \n", + " | model \n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3040b0c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Harrison worked at Kensho.'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke(\"where did harrison work?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "e1d20c7c", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Answer the question based only on the following context:\n", + "{context}\n", + "\n", + "Question: {question}\n", + "\n", + "Answer in the following language: {language}\n", + "\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "chain = {\n", + " \"context\": itemgetter(\"question\") | retriever, \n", + " \"question\": itemgetter(\"question\"), \n", + " \"language\": itemgetter(\"language\")\n", + "} | prompt | model | StrOutputParser()" + ] + }, + { + 
"cell_type": "code", + "execution_count": 7, + "id": "7ee8b2d4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Harrison ha lavorato a Kensho.'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"question\": \"where did harrison work\", \"language\": \"italian\"})" + ] + }, + { + "cell_type": "markdown", + "id": "f007669c", + "metadata": {}, + "source": [ + "## Conversational Retrieval Chain\n", + "\n", + "We can easily add in conversation history. This primarily means adding in chat_message_history" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "3f30c348", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableMap\n", + "from langchain.schema import format_document" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "64ab1dbf", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "\n", + "_template = \"\"\"Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language.\n", + "\n", + "Chat History:\n", + "{chat_history}\n", + "Follow Up Input: {question}\n", + "Standalone question:\"\"\"\n", + "CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7d628c97", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Answer the question based only on the following context:\n", + "{context}\n", + "\n", + "Question: {question}\n", + "\"\"\"\n", + "ANSWER_PROMPT = ChatPromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f60a5d0f", + "metadata": {}, + "outputs": [], + "source": [ + "DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template=\"{page_content}\")\n", + "def _combine_documents(docs, document_prompt = DEFAULT_DOCUMENT_PROMPT, document_separator=\"\\n\\n\"):\n", + " doc_strings = [format_document(doc, document_prompt) for doc in docs]\n", + " return document_separator.join(doc_strings)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "7d007db6", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import Tuple, List\n", + "def _format_chat_history(chat_history: List[Tuple]) -> str:\n", + " buffer = \"\"\n", + " for dialogue_turn in chat_history:\n", + " human = \"Human: \" + dialogue_turn[0]\n", + " ai = \"Assistant: \" + dialogue_turn[1]\n", + " buffer += \"\\n\" + \"\\n\".join([human, ai])\n", + " return buffer" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "5c32cc89", + "metadata": {}, + "outputs": [], + "source": [ + "_inputs = RunnableMap(\n", + " {\n", + " \"standalone_question\": {\n", + " \"question\": lambda x: x[\"question\"],\n", + " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", + " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", + " }\n", + ")\n", + "_context = {\n", + " \"context\": itemgetter(\"standalone_question\") | retriever | _combine_documents,\n", + " \"question\": lambda x: x[\"standalone_question\"]\n", + "}\n", + "conversational_qa_chain = _inputs | _context | ANSWER_PROMPT | ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "135c8205", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Harrison was employed 
at Kensho.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conversational_qa_chain.invoke({\n", + " \"question\": \"where did harrison work?\",\n", + " \"chat_history\": [],\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "424e7e7a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='Harrison worked at Kensho.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "conversational_qa_chain.invoke({\n", + " \"question\": \"where did he work?\",\n", + " \"chat_history\": [(\"Who wrote this notebook?\", \"Harrison\")],\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "c5543183", + "metadata": {}, + "source": [ + "### With Memory and returning source documents\n", + "\n", + "This shows how to use memory with the above. For memory, we need to manage that outside of the chain. For returning the retrieved documents, we just need to pass them through all the way." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e31dd17c", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.memory import ConversationBufferMemory" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d4bffe94", + "metadata": {}, + "outputs": [], + "source": [ + "memory = ConversationBufferMemory(return_messages=True, output_key=\"answer\", input_key=\"question\")" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "733be985", + "metadata": {}, + "outputs": [], + "source": [ + "# First we add a step to load memory\n", + "# This needs to be a RunnableMap because it's the first input\n", + "loaded_memory = RunnableMap(\n", + " {\n", + " \"question\": itemgetter(\"question\"),\n", + " \"memory\": memory.load_memory_variables,\n", + " }\n", + ")\n", + "# Next we add a step to expand memory into the variables\n", + "expanded_memory = {\n", + " \"question\": itemgetter(\"question\"),\n", + " \"chat_history\": lambda x: x[\"memory\"][\"history\"]\n", + "}\n", + "\n", + "# Now we calculate the standalone question\n", + "standalone_question = {\n", + " \"standalone_question\": {\n", + " \"question\": lambda x: x[\"question\"],\n", + " \"chat_history\": lambda x: _format_chat_history(x['chat_history'])\n", + " } | CONDENSE_QUESTION_PROMPT | ChatOpenAI(temperature=0) | StrOutputParser(),\n", + "}\n", + "# Now we retrieve the documents\n", + "retrieved_documents = {\n", + " \"docs\": itemgetter(\"standalone_question\") | retriever,\n", + " \"question\": lambda x: x[\"standalone_question\"]\n", + "}\n", + "# Now we construct the inputs for the final prompt\n", + "final_inputs = {\n", + " \"context\": lambda x: _combine_documents(x[\"docs\"]),\n", + " \"question\": itemgetter(\"question\")\n", + "}\n", + "# And finally, we do the part that returns the answers\n", + "answer = {\n", + " \"answer\": final_inputs | ANSWER_PROMPT | ChatOpenAI(),\n", + " \"docs\": itemgetter(\"docs\"),\n", + "}\n", + "# And now we put it all together!\n", + "final_chain = loaded_memory | expanded_memory | standalone_question | retrieved_documents | answer" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "806e390c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'answer': AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False),\n", + " 'docs': 
[Document(page_content='harrison worked at kensho', metadata={})]}" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inputs = {\"question\": \"where did harrison work?\"}\n", + "result = final_chain.invoke(inputs)\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "977399fd", + "metadata": {}, + "outputs": [], + "source": [ + "# Note that the memory does not save automatically\n", + "# This will be improved in the future\n", + "# For now you need to save it yourself\n", + "memory.save_context(inputs, {\"answer\": result[\"answer\"].content})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "f94f7de4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'history': [HumanMessage(content='where did harrison work?', additional_kwargs={}, example=False),\n", + " AIMessage(content='Harrison was employed at Kensho.', additional_kwargs={}, example=False)]}" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "memory.load_memory_variables({})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "poetry-venv", + "language": "python", + "name": "poetry-venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/sql_db.ipynb b/docs/extras/expression_language/cookbook/sql_db.ipynb new file mode 100644 index 0000000000..0cf0748009 --- /dev/null +++ b/docs/extras/expression_language/cookbook/sql_db.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "raw", + "id": "c14da114-1a4a-487d-9cff-e0e8c30ba366", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 3\n", + "title: Querying a SQL DB\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "506e9636", + "metadata": {}, + "source": [ + "We can replicate our SQLDatabaseChain with Runnables." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "7a927516", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import ChatPromptTemplate\n", + "\n", + "template = \"\"\"Based on the table schema below, write a SQL query that would answer the user's question:\n", + "{schema}\n", + "\n", + "Question: {question}\n", + "SQL Query:\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3f51f386", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.utilities import SQLDatabase" + ] + }, + { + "cell_type": "markdown", + "id": "7c3449d6-684b-416e-ba16-90a035835a88", + "metadata": {}, + "source": [ + "We'll need the Chinook sample DB for this example. There are many places to download it from, e.g. 
https://database.guide/2-sample-databases-sqlite/" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "2ccca6fc", + "metadata": {}, + "outputs": [], + "source": [ + "db = SQLDatabase.from_uri(\"sqlite:///./Chinook.db\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "05ba88ee", + "metadata": {}, + "outputs": [], + "source": [ + "def get_schema(_):\n", + " return db.get_table_info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "a4eda902", + "metadata": {}, + "outputs": [], + "source": [ + "def run_query(query):\n", + " return db.run(query)" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "5046cb17", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.schema.runnable import RunnableLambda, RunnableMap\n", + "\n", + "model = ChatOpenAI()\n", + "\n", + "inputs = {\n", + " \"schema\": RunnableLambda(get_schema),\n", + " \"question\": itemgetter(\"question\")\n", + "}\n", + "sql_response = (\n", + " RunnableMap(inputs)\n", + " | prompt\n", + " | model.bind(stop=[\"\\nSQLResult:\"])\n", + " | StrOutputParser()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a5552039", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'SELECT COUNT(*) FROM Employee'" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sql_response.invoke({\"question\": \"How many employees are there?\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "d6fee130", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"Based on the table schema below, question, sql query, and sql response, write a natural language response:\n", + "{schema}\n", + "\n", + "Question: {question}\n", + "SQL Query: {query}\n", + "SQL Response: {response}\"\"\"\n", + "prompt_response = ChatPromptTemplate.from_template(template)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "923aa634", + "metadata": {}, + "outputs": [], + "source": [ + "full_chain = (\n", + " RunnableMap({\n", + " \"question\": itemgetter(\"question\"),\n", + " \"query\": sql_response,\n", + " }) \n", + " | {\n", + " \"schema\": RunnableLambda(get_schema),\n", + " \"question\": itemgetter(\"question\"),\n", + " \"query\": itemgetter(\"query\"),\n", + " \"response\": lambda x: db.run(x[\"query\"]) \n", + " } \n", + " | prompt_response \n", + " | model\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e94963d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='There are 8 employees.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "full_chain.invoke({\"question\": \"How many employees are there?\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f358d7b-a721-4db3-9f92-f06913428afc", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + 
"pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/cookbook/tools.ipynb b/docs/extras/expression_language/cookbook/tools.ipynb new file mode 100644 index 0000000000..d13dece3c9 --- /dev/null +++ b/docs/extras/expression_language/cookbook/tools.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "29781123", + "metadata": {}, + "source": [ + "# Using tools\n", + "\n", + "You can use any Tools with Runnables easily." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a5c579dd-2e22-41b0-a789-346dfdecb5a2", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install duckduckgo-search" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "9232d2a9", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "from langchain.tools import DuckDuckGoSearchRun" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a0c64d2c", + "metadata": {}, + "outputs": [], + "source": [ + "search = DuckDuckGoSearchRun()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "391969b6", + "metadata": {}, + "outputs": [], + "source": [ + "template = \"\"\"turn the following user input into a search query for a search engine:\n", + "\n", + "{input}\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "model = ChatOpenAI()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "e3d9d20d", + "metadata": {}, + "outputs": [], + "source": [ + "chain = prompt | model | StrOutputParser() | search" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "55f2967d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'What sports games are on TV today & tonight? Watch and stream live sports on TV today, tonight, tomorrow. Today\\'s 2023 sports TV schedule includes football, basketball, baseball, hockey, motorsports, soccer and more. Watch on TV or stream online on ESPN, FOX, FS1, CBS, NBC, ABC, Peacock, Paramount+, fuboTV, local channels and many other networks. MLB Games Tonight: How to Watch on TV, Streaming & Odds - Thursday, September 7. Seattle Mariners\\' Julio Rodriguez greets teammates in the dugout after scoring against the Oakland Athletics in a ... Circle - Country Music and Lifestyle. Live coverage of all the MLB action today is available to you, with the information provided below. The Brewers will look to pick up a road win at PNC Park against the Pirates on Wednesday at 12:35 PM ET. Check out the latest odds and with BetMGM Sportsbook. Use bonus code \"GNPLAY\" for special offers! MLB Games Tonight: How to Watch on TV, Streaming & Odds - Tuesday, September 5. Houston Astros\\' Kyle Tucker runs after hitting a double during the fourth inning of a baseball game against the Los Angeles Angels, Sunday, Aug. 13, 2023, in Houston. (AP Photo/Eric Christian Smith) (APMedia) The Houston Astros versus the Texas Rangers is one of ... The second half of tonight\\'s college football schedule still has some good games remaining to watch on your television.. We\\'ve already seen an exciting one when Colorado upset TCU. 
And we saw some ...'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"input\": \"I'd like to figure out what games are tonight\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a16949cf-00ea-43c6-a6aa-797ad4f6918d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "poetry-venv", + "language": "python", + "name": "poetry-venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/how_to/_category_.yml b/docs/extras/expression_language/how_to/_category_.yml new file mode 100644 index 0000000000..39fa22bfbf --- /dev/null +++ b/docs/extras/expression_language/how_to/_category_.yml @@ -0,0 +1,2 @@ +label: 'How to' +position: 1 \ No newline at end of file diff --git a/docs/extras/expression_language/how_to/functions.ipynb b/docs/extras/expression_language/how_to/functions.ipynb new file mode 100644 index 0000000000..fc2f0a2962 --- /dev/null +++ b/docs/extras/expression_language/how_to/functions.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fbc4bf6e", + "metadata": {}, + "source": [ + "# Run arbitrary functions\n", + "\n", + "You can use arbitrary functions in the pipeline.\n", + "\n", + "Note that all inputs to these functions need to be a SINGLE argument. If you have a function that accepts multiple arguments, you should write a wrapper that accepts a single input and unpacks it into multiple arguments."
+ ] + }, + { + "cell_type": "code", + "execution_count": 77, + "id": "6bb221b3", + "metadata": {}, + "outputs": [], + "source": [ + "from operator import itemgetter\n", + "\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.prompts import ChatPromptTemplate\n", + "from langchain.schema.runnable import RunnableLambda\n", + "\n", + "def length_function(text):\n", + " return len(text)\n", + "\n", + "def _multiple_length_function(text1, text2):\n", + " return len(text1) * len(text2)\n", + "\n", + "def multiple_length_function(_dict):\n", + " return _multiple_length_function(_dict[\"text1\"], _dict[\"text2\"])\n", + "\n", + "model = ChatOpenAI()\n", + "prompt = ChatPromptTemplate.from_template(\"what is {a} + {b}\")\n", + "\n", + "chain1 = prompt | model\n", + "\n", + "chain = {\n", + " \"a\": itemgetter(\"foo\") | RunnableLambda(length_function),\n", + " \"b\": {\"text1\": itemgetter(\"foo\"), \"text2\": itemgetter(\"bar\")} | RunnableLambda(multiple_length_function)\n", + "} | prompt | model" + ] + }, + { + "cell_type": "code", + "execution_count": 78, + "id": "5488ec85", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content='3 + 9 equals 12.', additional_kwargs={}, example=False)" + ] + }, + "execution_count": 78, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.invoke({\"foo\": \"bar\", \"bar\": \"gah\"})" + ] + }, + { + "cell_type": "markdown", + "id": "4728ddd9-914d-42ce-ae9b-72c9ce8ec940", + "metadata": {}, + "source": [ + "## Accepting a Runnable Config\n", + "\n", + "Runnable lambdas can optionally accept a [RunnableConfig](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.config.RunnableConfig.html?highlight=runnableconfig#langchain.schema.runnable.config.RunnableConfig), which they can use to pass callbacks, tags, and other configuration information to nested runs." + ] + }, + { + "cell_type": "code", + "execution_count": 139, + "id": "80b3b5f6-5d58-44b9-807e-cce9a46bf49f", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.schema.runnable import RunnableConfig" + ] + }, + { + "cell_type": "code", + "execution_count": 149, + "id": "ff0daf0c-49dd-4d21-9772-e5fa133c5f36", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "\n", + "from langchain.schema.output_parser import StrOutputParser\n", + "\n", + "def parse_or_fix(text: str, config: RunnableConfig):\n", + " fixing_chain = (\n", + " ChatPromptTemplate.from_template(\n", + " \"Fix the following text:\\n\\n```text\\n{input}\\n```\\nError: {error}\"\n", + " \" Don't narrate, just respond with the fixed data.\"\n", + " )\n", + " | ChatOpenAI()\n", + " | StrOutputParser()\n", + " )\n", + " for _ in range(3):\n", + " try:\n", + " return json.loads(text)\n", + " except Exception as e:\n", + " text = fixing_chain.invoke({\"input\": text, \"error\": e}, config)\n", + " return \"Failed to parse\"" + ] + }, + { + "cell_type": "code", + "execution_count": 152, + "id": "1a5e709e-9d75-48c7-bb9c-503251990505", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tokens Used: 65\n", + "\tPrompt Tokens: 56\n", + "\tCompletion Tokens: 9\n", + "Successful Requests: 1\n", + "Total Cost (USD): $0.00010200000000000001\n" + ] + } + ], + "source": [ + "from langchain.callbacks import get_openai_callback\n", + "\n", + "with get_openai_callback() as cb:\n", + " RunnableLambda(parse_or_fix).invoke(\"{foo: bar}\", {\"tags\": [\"my-tag\"], \"callbacks\": [cb]})\n", + " print(cb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, +
"file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/expression_language/interface.ipynb b/docs/extras/expression_language/interface.ipynb index cf19bfe4db..c47800ecad 100644 --- a/docs/extras/expression_language/interface.ipynb +++ b/docs/extras/expression_language/interface.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "366a0e68-fd67-4fe5-a292-5c33733339ea", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 0\n", + "title: Interface\n", + "---" + ] + }, { "cell_type": "markdown", "id": "9a9acd2e", "metadata": {}, "source": [ - "# Interface\n", - "\n", "In an effort to make it as easy as possible to create custom chains, we've implemented a [\"Runnable\"](https://api.python.langchain.com/en/latest/schema/langchain.schema.runnable.Runnable.html#langchain.schema.runnable.Runnable) protocol that most components implement. This is a standard interface with a few different methods, which makes it easy to define custom chains as well as making it possible to invoke them in a standard way. The standard interface exposed includes:\n", "\n", "- `stream`: stream back chunks of the response\n", @@ -429,7 +438,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb similarity index 85% rename from docs/extras/guides/privacy/presidio_data_anonymization.ipynb rename to docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb index faa9929259..2502a45092 100644 --- a/docs/extras/guides/privacy/presidio_data_anonymization.ipynb +++ b/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb @@ -6,7 +6,7 @@ "source": [ "# Data anonymization with Microsoft Presidio\n", "\n", - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization.ipynb)\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/index.ipynb)\n", "\n", "## Use case\n", "\n", @@ -28,7 +28,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -47,16 +47,16 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'" + "'My name is Laura Ruiz, call me at +1-412-982-8374x13414 or email me at javierwatkins@example.net'" ] }, - "execution_count": 14, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -82,7 +82,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -94,35 +94,53 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "text = f\"\"\"Slim Shady recently lost his wallet. 
\n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 5, "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)" - ] - }, - "execution_count": 16, - "metadata": {}, - "output_type": "execute_result" + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Richard Fields has recently misplaced his wallet, which contains a sum of cash and his credit card bearing the number 30479847307774. \n", + "\n", + "Should you happen to come across it, we kindly request that you contact us immediately at 6439182672 or via email at frank45@example.com.\n", + "\n", + "Thank you for your attention to this matter.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] } ], "source": [ "from langchain.prompts.prompt import PromptTemplate\n", "from langchain.chat_models import ChatOpenAI\n", - "from langchain.schema.runnable import RunnablePassthrough\n", "\n", - "template = \"\"\"According to this text, where can you find our super secret data?\n", + "anonymizer = PresidioAnonymizer()\n", "\n", - "{anonymized_text}\n", + "template = \"\"\"Rewrite this text into an official, short email:\n", "\n", - "Answer:\"\"\"\n", + "{anonymized_text}\"\"\"\n", "prompt = PromptTemplate.from_template(template)\n", - "llm = ChatOpenAI()\n", + "llm = ChatOpenAI(temperature=0)\n", "\n", "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", - "chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")" + "response = chain.invoke(text)\n", + "print(response.content)" ] }, { @@ -135,16 +153,16 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" + "'My name is Adrian Fleming, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'" ] }, - "execution_count": 18, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -166,16 +184,16 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Victoria Mckinney, call me at 713-549-8623 or email me at real.slim.shady@gmail.com'" + "'My name is Justin Miller, call me at 761-824-1889 or email me at real.slim.shady@gmail.com'" ] }, - "execution_count": 3, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -201,16 +219,16 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My name is Billy Russo, call me at 970-996-9453x038 or email me at jamie80@example.org'" + "'My name is Dr. 
Jennifer Baker, call me at (508)839-9329x232 or email me at ehamilton@example.com'" ] }, - "execution_count": 4, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -232,16 +250,16 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My polish phone number is EVIA70648911396944'" + "'My polish phone number is NRGN41434238921378'" ] }, - "execution_count": 5, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -261,7 +279,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -291,7 +309,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -308,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -337,16 +355,16 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'+48 533 220 543'" + "'511 622 683'" ] }, - "execution_count": 9, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -374,7 +392,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -389,7 +407,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -398,16 +416,16 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'My polish phone number is +48 692 715 636'" + "'My polish phone number is +48 734 630 977'" ] }, - "execution_count": 12, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -421,8 +439,6 @@ "metadata": {}, "source": [ "## Future works\n", - "\n", - "- **deanonymization** - add the ability to reverse anonymization. For example, the workflow could look like this: `anonymize -> LLMChain -> deanonymize`. By doing this, we will retain anonymity in requests to, for example, OpenAI, and then be able restore the original data.\n", "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object." ] } diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb new file mode 100644 index 0000000000..63ba8931a6 --- /dev/null +++ b/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb @@ -0,0 +1,520 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multi-language data anonymization with Microsoft Presidio\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/multi_language.ipynb)\n", + "\n", + "\n", + "## Use case\n", + "\n", + "Multi-language support in data pseudonymization is essential due to differences in language structures and cultural contexts. 
Different languages may have varying formats for personal identifiers. For example, the structure of names, locations and dates can differ greatly between languages and regions. Furthermore, non-alphanumeric characters, accents, and the direction of writing can impact pseudonymization processes. Without multi-language support, data could remain identifiable or be misinterpreted, compromising data privacy and accuracy. Hence, it enables effective and precise pseudonymization suited for global operations.\n", + "\n", + "## Overview\n", + "\n", + "PII detection in Microsoft Presidio relies on several components - in addition to the usual pattern matching (e.g. using regex), the analyser uses a model for Named Entity Recognition (NER) to extract entities such as:\n", + "- `PERSON`\n", + "- `LOCATION`\n", + "- `DATE_TIME`\n", + "- `NRP`\n", + "- `ORGANIZATION`\n", + "\n", + "[[Source]](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)\n", + "\n", + "To handle NER in specific languages, we utilize unique models from the `spaCy` library, recognized for its extensive selection covering multiple languages and sizes. However, it's not restrictive, allowing for integration of alternative frameworks such as [Stanza](https://microsoft.github.io/presidio/analyzer/nlp_engines/spacy_stanza/) or [transformers](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/) when necessary.\n", + "\n", + "\n", + "## Quickstart\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", + "# ! python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\"],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, `PresidioAnonymizer` and `PresidioReversibleAnonymizer` use a model trained on English texts, so they handle other languages moderately well. \n", + "\n", + "For example, here the model did not detect the person:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Me llamo Sofía'" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.anonymize(\"Me llamo Sofía\") # \"My name is Sofía\" in Spanish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "They may also take words from another language as actual entities. 
Here, both the word *'Yo'* (*'I'* in Spanish) and *Sofía* have been classified as `PERSON`:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'Bridget Kirk soy Sally Knight'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.anonymize(\"Yo soy Sofía\") # \"I am Sofía\" in Spanish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "If you want to anonymise texts from other languages, you need to download other models and add them to the anonymiser configuration:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Download the models for the languages you want to use\n", + "# ! python -m spacy download en_core_web_md\n", + "# ! python -m spacy download es_core_news_md" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "nlp_config = {\n", + "    \"nlp_engine_name\": \"spacy\",\n", + "    \"models\": [\n", + "        {\"lang_code\": \"en\", \"model_name\": \"en_core_web_md\"},\n", + "        {\"lang_code\": \"es\", \"model_name\": \"es_core_news_md\"},\n", + "    ],\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We have therefore added a Spanish language model. Note also that we have downloaded an alternative model for English as well - in this case we have replaced the large model `en_core_web_lg` (560MB) with its smaller version `en_core_web_md` (40MB) - the size is therefore reduced by 14 times! If you care about the speed of anonymisation, this is worth considering.\n", + "\n", + "All models for the different languages can be found in the [spaCy documentation](https://spacy.io/usage/models).\n", + "\n", + "Now pass the configuration as the `languages_config` parameter to the anonymiser. As you can see, both previous examples work flawlessly:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Me llamo Michelle Smith\n", + "Yo soy Rachel Wright\n" + ] + } + ], + "source": [ + "anonymizer = PresidioReversibleAnonymizer(\n", + "    analyzed_fields=[\"PERSON\"],\n", + "    languages_config=nlp_config,\n", + ")\n", + "\n", + "print(\n", + "    anonymizer.anonymize(\"Me llamo Sofía\", language=\"es\")\n", + ")  # \"My name is Sofía\" in Spanish\n", + "print(anonymizer.anonymize(\"Yo soy Sofía\", language=\"es\"))  # \"I am Sofía\" in Spanish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "By default, the language indicated first in the configuration will be used when anonymising text (in this case English):" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "My name is Ronnie Ayala\n" + ] + } + ], + "source": [ + "print(anonymizer.anonymize(\"My name is John\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Advanced usage\n", + "\n", + "### Custom labels in NER model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "It may be that the spaCy model has different class names than those supported by Microsoft Presidio by default. 
Take Polish, for example:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Text: Wiktoria, Start: 12, End: 20, Label: persName\n" + ] + } + ], + "source": [ + "# ! python -m spacy download pl_core_news_md\n", + "\n", + "import spacy\n", + "\n", + "nlp = spacy.load(\"pl_core_news_md\")\n", + "doc = nlp(\"Nazywam się Wiktoria\")  # \"My name is Wiktoria\" in Polish\n", + "\n", + "for ent in doc.ents:\n", + "    print(\n", + "        f\"Text: {ent.text}, Start: {ent.start_char}, End: {ent.end_char}, Label: {ent.label_}\"\n", + "    )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The name *Wiktoria* was classified as `persName`, which does not correspond to the default class names `PERSON`/`PER` implemented in Microsoft Presidio (look for `CHECK_LABEL_GROUPS` in [SpacyRecognizer implementation](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/spacy_recognizer.py)). \n", + "\n", + "You can find out more about custom labels in spaCy models (including your own, trained ones) in [this thread](https://github.com/microsoft/presidio/issues/851).\n", + "\n", + "That's why our sentence will not be anonymized:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Wiktoria\n" + ] + } + ], + "source": [ + "nlp_config = {\n", + "    \"nlp_engine_name\": \"spacy\",\n", + "    \"models\": [\n", + "        {\"lang_code\": \"en\", \"model_name\": \"en_core_web_md\"},\n", + "        {\"lang_code\": \"es\", \"model_name\": \"es_core_news_md\"},\n", + "        {\"lang_code\": \"pl\", \"model_name\": \"pl_core_news_md\"},\n", + "    ],\n", + "}\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + "    analyzed_fields=[\"PERSON\", \"LOCATION\", \"DATE_TIME\"],\n", + "    languages_config=nlp_config,\n", + ")\n", + "\n", + "print(\n", + "    anonymizer.anonymize(\"Nazywam się Wiktoria\", language=\"pl\")\n", + ")  # \"My name is Wiktoria\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To address this, create your own `SpacyRecognizer` with your own class mapping and add it to the anonymizer:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "from presidio_analyzer.predefined_recognizers import SpacyRecognizer\n", + "\n", + "polish_check_label_groups = [\n", + "    ({\"LOCATION\"}, {\"placeName\", \"geogName\"}),\n", + "    ({\"PERSON\"}, {\"persName\"}),\n", + "    ({\"DATE_TIME\"}, {\"date\", \"time\"}),\n", + "]\n", + "\n", + "spacy_recognizer = SpacyRecognizer(\n", + "    supported_language=\"pl\",\n", + "    check_label_groups=polish_check_label_groups,\n", + ")\n", + "\n", + "anonymizer.add_recognizer(spacy_recognizer)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now everything works smoothly:" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Morgan Walters\n" + ] + } + ], + "source": [ + "print(\n", + "    anonymizer.anonymize(\"Nazywam się Wiktoria\", language=\"pl\")\n", + ")  # \"My name is Wiktoria\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try a more complex example:" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + 
"outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Ernest Liu. New Taylorburgh to moje miasto rodzinne. Urodziłam się 1987-01-19\n" + ] + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"Nazywam się Wiktoria. Płock to moje miasto rodzinne. Urodziłam się dnia 6 kwietnia 2001 roku\",\n", + " language=\"pl\",\n", + " )\n", + ") # \"My name is Wiktoria. Płock is my home town. I was born on 6 April 2001\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As you can see, thanks to class mapping, the anonymiser can cope with different types of entities. " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Custom language-specific operators\n", + "\n", + "In the example above, the sentence has been anonymised correctly, but the fake data does not fit the Polish language at all. Custom operators can therefore be added, which will resolve the issue:" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from faker import Faker\n", + "from presidio_anonymizer.entities import OperatorConfig\n", + "\n", + "fake = Faker(locale=\"pl_PL\") # Setting faker to provide Polish data\n", + "\n", + "new_operators = {\n", + " \"PERSON\": OperatorConfig(\"custom\", {\"lambda\": lambda _: fake.first_name_female()}),\n", + " \"LOCATION\": OperatorConfig(\"custom\", {\"lambda\": lambda _: fake.city()}),\n", + "}\n", + "\n", + "anonymizer.add_operators(new_operators)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Nazywam się Marianna. Szczecin to moje miasto rodzinne. Urodziłam się 1976-11-16\n" + ] + } + ], + "source": [ + "print(\n", + " anonymizer.anonymize(\n", + " \"Nazywam się Wiktoria. Płock to moje miasto rodzinne. Urodziłam się dnia 6 kwietnia 2001 roku\",\n", + " language=\"pl\",\n", + " )\n", + ") # \"My name is Wiktoria. Płock is my home town. I was born on 6 April 2001\" in Polish" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Limitations\n", + "\n", + "Remember - results are as good as your recognizers and as your NER models!\n", + "\n", + "Look at the example below - we downloaded the small model for Spanish (12MB) and it no longer performs as well as the medium version (40MB):" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Model: es_core_news_sm. Result: Me llamo Sofía\n", + "Model: es_core_news_md. Result: Me llamo Lawrence Davis\n" + ] + } + ], + "source": [ + "# ! python -m spacy download es_core_news_sm\n", + "\n", + "for model in [\"es_core_news_sm\", \"es_core_news_md\"]:\n", + " nlp_config = {\n", + " \"nlp_engine_name\": \"spacy\",\n", + " \"models\": [\n", + " {\"lang_code\": \"es\", \"model_name\": model},\n", + " ],\n", + " }\n", + "\n", + " anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\"],\n", + " languages_config=nlp_config,\n", + " )\n", + "\n", + " print(\n", + " f\"Model: {model}. Result: {anonymizer.anonymize('Me llamo Sofía', language='es')}\"\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In many cases, even the larger models from spaCy will not be sufficient - there are already other, more complex and better methods of detecting named entities, based on transformers. 
You can read more about this [here](https://microsoft.github.io/presidio/analyzer/nlp_engines/transformers/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Future works\n", + "\n", + "- **automatic language detection** - instead of passing the language as a parameter in `anonymizer.anonymize`, we could detect the language/s beforehand and then use the corresponding NER model." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb new file mode 100644 index 0000000000..de5655ba1e --- /dev/null +++ b/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb @@ -0,0 +1,461 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Reversible data anonymization with Microsoft Presidio\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_data_anonymization/reversible.ipynb)\n", + "\n", + "\n", + "## Use case\n", + "\n", + "We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology when sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and the original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n", + "\n", + "## Overview\n", + "\n", + "We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n", + "\n", + "1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n", + "```\n", + "    {\n", + "        \"PERSON\": {\n", + "            \"\": \"\",\n", + "            \"John Doe\": \"Slim Shady\"\n", + "        },\n", + "        \"PHONE_NUMBER\": {\n", + "            \"111-111-1111\": \"555-555-5555\"\n", + "        }\n", + "        ...\n", + "    }\n", + "```\n", + "\n", + "2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n", + "\n", + "Between anonymization and deanonymization, the user can perform different operations, for example, passing the output to an LLM.\n", + "\n", + "## Quickstart\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Install necessary packages\n", + "# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n", + "# ! 
python -m spacy download en_core_web_lg" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. By the way, my card number is: 4838637940262'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + " analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + " # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + " # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", + " faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + " \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", + " \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This is what the full string we want to deanonymize looks like:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Maria Lynch recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4838637940262. \n", + "If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n", + "Maria Lynch would be very grateful!\n" + ] + } + ], + "source": [ + "# We know this data, as we set the faker_seed parameter\n", + "fake_name = \"Maria Lynch\"\n", + "fake_phone = \"7344131647\"\n", + "fake_email = \"jamesmichael@example.com\"\n", + "fake_credit_card = \"4838637940262\"\n", + "\n", + "anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n", + "Inside is some cash and his credit card with the number {fake_credit_card}. \n", + "If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n", + "{fake_name} would be very grateful!\"\"\"\n", + "\n", + "print(anonymized_text)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And now, using the `deanonymize` method, we can reverse the process:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n", + "Slim Shady would be very grateful!\n" + ] + } + ], + "source": [ + "print(anonymizer.deanonymize(anonymized_text))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Using with LangChain Expression Language\n", + "\n", + "With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. 
This is an example of using the anonymization mechanism with a query to an LLM (without deanonymization for now):" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "text = f\"\"\"Slim Shady recently lost his wallet. \n", + "Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n", + "If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n", + "\n", + "If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "from langchain.prompts.prompt import PromptTemplate\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "template = \"\"\"Rewrite this text into an official, short email:\n", + "\n", + "{anonymized_text}\"\"\"\n", + "prompt = PromptTemplate.from_template(template)\n", + "llm = ChatOpenAI(temperature=0)\n", + "\n", + "chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n", + "response = chain.invoke(text)\n", + "print(response.content)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, let's add a **deanonymization step** to our sequence:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Dear Sir/Madam,\n", + "\n", + "We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n", + "\n", + "If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n", + "\n", + "Your prompt assistance in this matter would be greatly appreciated.\n", + "\n", + "Yours faithfully,\n", + "\n", + "[Your Name]\n" + ] + } + ], + "source": [ + "chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n", + "response = chain.invoke(text)\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Only anonymized data was given to the model itself, and therefore the original values were protected from being leaked to the outside world. Then, the model's response was processed, and the fake values were replaced with the real ones.\n",
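+ "\n",
+ "The two steps above can also be composed into a single runnable. Below is a minimal sketch of such a round trip, reusing the `anonymizer`, `prompt`, `llm` and `text` objects defined above:\n",
+ "\n",
+ "```python\n",
+ "from langchain.schema.runnable import RunnableLambda\n",
+ "\n",
+ "round_trip = (\n",
+ "    {\"anonymized_text\": anonymizer.anonymize}  # mask PII before prompting\n",
+ "    | prompt\n",
+ "    | llm\n",
+ "    | RunnableLambda(lambda msg: anonymizer.deanonymize(msg.content))  # restore PII\n",
+ ")\n",
+ "\n",
+ "print(round_trip.invoke(text))\n",
+ "```"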
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Extra knowledge" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where the key is the fake PII and the value is the original one: " + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n", + "\n", + "anonymizer = PresidioReversibleAnonymizer(\n", + "    analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n", + "    # Faker seed is used here to make sure the same fake data is generated for the test purposes\n", + "    # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n", + "    faker_seed=42,\n", + ")\n", + "\n", + "anonymizer.anonymize(\n", + "    \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n", + "    \"By the way, my card number is: 4916 0387 9536 0861\"\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Anonymizing more texts will result in new mapping entries:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n" + ] + }, + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + "  '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(\n", + "    anonymizer.anonymize(\n", + "        \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. 
I'm John Doe by the way.\"" + )\n", + ")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can save the mapping itself to a file for future use: " + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# We can save the deanonymizer mapping as a JSON or YAML file\n", + "\n", + "anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then, load it in another `PresidioReversibleAnonymizer` instance:" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{}" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer = PresidioReversibleAnonymizer()\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n", + " 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n", + " 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n", + " 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n", + "  '3537672423884966': '4001 9192 5753 7193'}}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n", + "\n", + "anonymizer.deanonymizer_mapping" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Future works\n", + "\n", + "- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n", + "- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the non-determinism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs.\n",
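+ "\n",
+ "As a starting point, here is a rough sketch of such fuzzy matching - a hypothetical `fuzzy_deanonymize` helper (not part of the library) that tolerates small edits the LLM may make to the fake values:\n",
+ "\n",
+ "```python\n",
+ "from difflib import SequenceMatcher\n",
+ "\n",
+ "def fuzzy_deanonymize(text: str, mapping: dict, threshold: float = 0.85) -> str:\n",
+ "    # Flatten {entity_type: {fake: original}} into a single {fake: original} dict\n",
+ "    flat = {f: o for values in mapping.values() for f, o in values.items()}\n",
+ "    for fake, original in flat.items():\n",
+ "        if fake in text:  # cheap exact-match path\n",
+ "            text = text.replace(fake, original)\n",
+ "            continue\n",
+ "        # Slide a window of the same length over the text and replace\n",
+ "        # the best near-match scoring above the threshold\n",
+ "        n = len(fake)\n",
+ "        best, best_score = None, threshold\n",
+ "        for i in range(len(text) - n + 1):\n",
+ "            score = SequenceMatcher(None, fake, text[i : i + n]).ratio()\n",
+ "            if score > best_score:\n",
+ "                best, best_score = text[i : i + n], score\n",
+ "        if best is not None:\n",
+ "            text = text.replace(best, original)\n",
+ "    return text\n",
+ "\n",
+ "# e.g. fuzzy_deanonymize(model_answer, anonymizer.deanonymizer_mapping)\n",
+ "```"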
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/guides/safety/amazon_comprehend_chain.ipynb b/docs/extras/guides/safety/amazon_comprehend_chain.ipynb index e7e1961d42..69117b8257 100644 --- a/docs/extras/guides/safety/amazon_comprehend_chain.ipynb +++ b/docs/extras/guides/safety/amazon_comprehend_chain.ipynb @@ -512,9 +512,9 @@ "# Examples\n", "---\n", - "## With HuggingFace Hub Models\n", + "## With Hugging Face Hub Models\n", "\n", - "Get your API Key from Huggingface hub - https://huggingface.co/docs/api-inference/quicktour#get-your-api-token" + "Get your API Key from Hugging Face hub - https://huggingface.co/docs/api-inference/quicktour#get-your-api-token" ] }, { diff --git a/docs/extras/integrations/callbacks/confident.ipynb b/docs/extras/integrations/callbacks/confident.ipynb new file mode 100644 index 0000000000..ca4c9ae062 --- /dev/null +++ b/docs/extras/integrations/callbacks/confident.ipynb @@ -0,0 +1,310 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Confident\n", + "\n", + ">[DeepEval](https://confident-ai.com) is a package for unit testing LLMs.\n", + "> Using Confident, everyone can build robust language models through faster iterations\n", + "> using both unit testing and integration testing. We provide support for each step in the iteration\n", + "> from synthetic data creation to testing.\n" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this guide we will demonstrate how to test and measure LLM performance. We show how you can use our callback to measure performance and how you can define your own metrics and log them to our dashboard.\n", + "\n", + "DeepEval also offers:\n", + "- Synthetic data generation\n", + "- Performance measurement\n", + "- A dashboard to monitor and review results over time" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "## Installation and Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install deepeval --upgrade" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Getting API Credentials\n", + "\n", + "To get the DeepEval API credentials, follow these steps:\n", + "\n", + "1. Go to https://app.confident-ai.com\n", + "2. Click on \"Organization\"\n", + "3. Copy the API Key.\n", + "\n", + "\n", + "When you log in, you will also be asked to set the `implementation` name. The implementation name is required to describe the type of implementation. (Think of what you want to call your project. We recommend making it descriptive.)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "!deepeval login" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Setup DeepEval\n", + "\n", + "You can, by default, use the `DeepEvalCallbackHandler` to set up the metrics you want to track. 
However, this has limited support for metrics at the moment (more to be added soon). It currently supports:\n", + "- [Answer Relevancy](https://docs.confident-ai.com/docs/measuring_llm_performance/answer_relevancy)\n", + "- [Bias](https://docs.confident-ai.com/docs/measuring_llm_performance/debias)\n", + "- [Toxicness](https://docs.confident-ai.com/docs/measuring_llm_performance/non_toxic)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from deepeval.metrics.answer_relevancy import AnswerRelevancy\n", + "\n", + "# Here we want to make sure the answer is minimally relevant\n", + "answer_relevancy_metric = AnswerRelevancy(minimum_score=0.5)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Get Started" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use the `DeepEvalCallbackHandler`, we need the `implementation_name`. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from langchain.callbacks.confident_callback import DeepEvalCallbackHandler\n", + "\n", + "deepeval_callback = DeepEvalCallbackHandler(\n", + " implementation_name=\"langchainQuickstart\",\n", + " metrics=[answer_relevancy_metric]\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scenario 1: Feeding into LLM\n", + "\n", + "You can then feed it into your LLM with OpenAI." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "LLMResult(generations=[[Generation(text='\\n\\nQ: What did the fish say when he hit the wall? \\nA: Dam.', generation_info={'finish_reason': 'stop', 'logprobs': None})], [Generation(text='\\n\\nThe Moon \\n\\nThe moon is high in the midnight sky,\\nSparkling like a star above.\\nThe night so peaceful, so serene,\\nFilling up the air with love.\\n\\nEver changing and renewing,\\nA never-ending light of grace.\\nThe moon remains a constant view,\\nA reminder of life’s gentle pace.\\n\\nThrough time and space it guides us on,\\nA never-fading beacon of hope.\\nThe moon shines down on us all,\\nAs it continues to rise and elope.', generation_info={'finish_reason': 'stop', 'logprobs': None})], [Generation(text='\\n\\nQ. What did one magnet say to the other magnet?\\nA. \"I find you very attractive!\"', generation_info={'finish_reason': 'stop', 'logprobs': None})], [Generation(text=\"\\n\\nThe world is charged with the grandeur of God.\\nIt will flame out, like shining from shook foil;\\nIt gathers to a greatness, like the ooze of oil\\nCrushed. Why do men then now not reck his rod?\\n\\nGenerations have trod, have trod, have trod;\\nAnd all is seared with trade; bleared, smeared with toil;\\nAnd wears man's smudge and shares man's smell: the soil\\nIs bare now, nor can foot feel, being shod.\\n\\nAnd for all this, nature is never spent;\\nThere lives the dearest freshness deep down things;\\nAnd though the last lights off the black West went\\nOh, morning, at the brown brink eastward, springs —\\n\\nBecause the Holy Ghost over the bent\\nWorld broods with warm breast and with ah! 
bright wings.\\n\\n~Gerard Manley Hopkins\", generation_info={'finish_reason': 'stop', 'logprobs': None})], [Generation(text='\\n\\nQ: What did one ocean say to the other ocean?\\nA: Nothing, they just waved.', generation_info={'finish_reason': 'stop', 'logprobs': None})], [Generation(text=\"\\n\\nA poem for you\\n\\nOn a field of green\\n\\nThe sky so blue\\n\\nA gentle breeze, the sun above\\n\\nA beautiful world, for us to love\\n\\nLife is a journey, full of surprise\\n\\nFull of joy and full of surprise\\n\\nBe brave and take small steps\\n\\nThe future will be revealed with depth\\n\\nIn the morning, when dawn arrives\\n\\nA fresh start, no reason to hide\\n\\nSomewhere down the road, there's a heart that beats\\n\\nBelieve in yourself, you'll always succeed.\", generation_info={'finish_reason': 'stop', 'logprobs': None})]], llm_output={'token_usage': {'completion_tokens': 504, 'total_tokens': 528, 'prompt_tokens': 24}, 'model_name': 'text-davinci-003'})" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.llms import OpenAI\n", + "llm = OpenAI(\n", + "    temperature=0,\n", + "    callbacks=[deepeval_callback],\n", + "    verbose=True,\n", + "    openai_api_key=\"\",\n", + ")\n", + "output = llm.generate(\n", + "    [\n", + "        \"What is the best evaluation tool out there? (no bias at all)\",\n", + "    ]\n", + ")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can then check whether the metric was successful by calling the `is_successful()` method." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "answer_relevancy_metric.is_successful()\n", + "# returns True/False" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Once you have run that, you should be able to see our dashboard below. \n", + "\n", + "![Dashboard](https://docs.confident-ai.com/assets/images/dashboard-screenshot-b02db73008213a211b1158ff052d969e.png)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Scenario 2: Tracking an LLM in a chain without callbacks\n", + "\n", + "To track an LLM in a chain without callbacks, you can plug into it at the end.\n", + "\n", + "We can start by defining a simple chain as shown below."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import requests\n", + "from langchain.chains import RetrievalQA\n", + "from langchain.document_loaders import TextLoader\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.llms import OpenAI\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import Chroma\n", + "\n", + "text_file_url = \"https://raw.githubusercontent.com/hwchase17/chat-your-data/master/state_of_the_union.txt\"\n", + "\n", + "openai_api_key = \"sk-XXX\"\n", + "\n", + "with open(\"state_of_the_union.txt\", \"w\") as f:\n", + "    response = requests.get(text_file_url)\n", + "    f.write(response.text)\n", + "\n", + "loader = TextLoader(\"state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "texts = text_splitter.split_documents(documents)\n", + "\n", + "embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)\n", + "docsearch = Chroma.from_documents(texts, embeddings)\n", + "\n", + "qa = RetrievalQA.from_chain_type(\n", + "    llm=OpenAI(openai_api_key=openai_api_key), chain_type=\"stuff\",\n", + "    retriever=docsearch.as_retriever()\n", + ")\n", + "\n", + "# Run a query against the question-answering pipeline\n", + "query = \"Who is the president?\"\n", + "result = qa.run(query)" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After defining a chain, you can then manually check for answer similarity." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "answer_relevancy_metric.measure(result, query)\n", + "answer_relevancy_metric.is_successful()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What's next?\n", + "\n", + "You can create your own custom metrics [here](https://docs.confident-ai.com/docs/quickstart/custom-metrics). \n", + "\n", + "DeepEval also offers other features such as being able to [automatically create unit tests](https://docs.confident-ai.com/docs/quickstart/synthetic-data-creation) and run [tests for hallucination](https://docs.confident-ai.com/docs/measuring_llm_performance/factual_consistency).\n", + "\n", + "If you are interested, check out our GitHub repository here [https://github.com/confident-ai/deepeval](https://github.com/confident-ai/deepeval). We welcome any PRs and discussions on how to improve LLM performance."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "vscode": { + "interpreter": { + "hash": "a53ebf4a859167383b364e7e7521d0add3c2dbbdecce4edf676e8c4634ff3fbb" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/integrations/callbacks/llmonitor.md b/docs/extras/integrations/callbacks/llmonitor.md index daec3dad81..9d81ce12f1 100644 --- a/docs/extras/integrations/callbacks/llmonitor.md +++ b/docs/extras/integrations/callbacks/llmonitor.md @@ -1,19 +1,23 @@ # LLMonitor -[LLMonitor](https://llmonitor.com) is an open-source observability platform that provides cost tracking, user tracking and powerful agent tracing. +[LLMonitor](https://llmonitor.com?utm_source=langchain&utm_medium=py&utm_campaign=docs) is an open-source observability platform that provides cost and usage analytics, user tracking, tracing and evaluation tools. ## Setup -Create an account on [llmonitor.com](https://llmonitor.com), create an `App`, and then copy the associated `tracking id`. + +Create an account on [llmonitor.com](https://llmonitor.com?utm_source=langchain&utm_medium=py&utm_campaign=docs), then copy your new app's `tracking id`. + Once you have it, set it as an environment variable by running: + ```bash export LLMONITOR_APP_ID="..." ``` If you'd prefer not to set an environment variable, you can pass the key directly when initializing the callback handler: + ```python from langchain.callbacks import LLMonitorCallbackHandler @@ -21,12 +25,13 @@ handler = LLMonitorCallbackHandler(app_id="...") ``` ## Usage with LLM/Chat models + ```python from langchain.llms import OpenAI from langchain.chat_models import ChatOpenAI from langchain.callbacks import LLMonitorCallbackHandler -handler = LLMonitorCallbackHandler(app_id="...") +handler = LLMonitorCallbackHandler() llm = OpenAI( callbacks=[handler], @@ -38,26 +43,63 @@ chat = ChatOpenAI( ) ``` +## Usage with chains and agents + +Make sure to pass the callback handler to the `run` method so that all related chains and llm calls are correctly tracked. + +It is also recommended to pass `agent_name` in the metadata to be able to distinguish between agents in the dashboard. + +Example: + +```python +from langchain.chat_models import ChatOpenAI +from langchain.schema import SystemMessage, HumanMessage +from langchain.agents import OpenAIFunctionsAgent, AgentExecutor, tool +from langchain.callbacks import LLMonitorCallbackHandler + +llm = ChatOpenAI(temperature=0) + +handler = LLMonitorCallbackHandler() + +@tool +def get_word_length(word: str) -> int: + """Returns the length of a word.""" + return len(word) + +tools = [get_word_length] + +prompt = OpenAIFunctionsAgent.create_prompt( + system_message=SystemMessage( + content="You are very powerful assistant, but bad at calculating lengths of words." 
+ ) +) + +agent = OpenAIFunctionsAgent(llm=llm, tools=tools, prompt=prompt, verbose=True) +agent_executor = AgentExecutor( + agent=agent, tools=tools, verbose=True, metadata={"agent_name": "WordCount"} # <- recommended, assign a custom name +) +agent_executor.run("how many letters in the word educa?", callbacks=[handler]) +``` + +Another example: -## Usage with agents ```python from langchain.agents import load_tools, initialize_agent, AgentType from langchain.llms import OpenAI from langchain.callbacks import LLMonitorCallbackHandler -handler = LLMonitorCallbackHandler(app_id="...") +handler = LLMonitorCallbackHandler() llm = OpenAI(temperature=0) tools = load_tools(["serpapi", "llm-math"], llm=llm) -agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION) +agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, metadata={ "agent_name": "GirlfriendAgeFinder" }) # <- recommended, assign a custom name + agent.run( "Who is Leo DiCaprio's girlfriend? What is her current age raised to the 0.43 power?", callbacks=[handler], - metadata={ - "agentName": "Leo DiCaprio's girlfriend", # you can assign a custom agent in the metadata - }, ) ``` ## Support + For any question or issue with integration you can reach out to the LLMonitor team on [Discord](http://discord.com/invite/8PafSG58kK) or via [email](mailto:vince@llmonitor.com). diff --git a/docs/extras/integrations/chat/konko.ipynb b/docs/extras/integrations/chat/konko.ipynb new file mode 100644 index 0000000000..5884ff73c5 --- /dev/null +++ b/docs/extras/integrations/chat/konko.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Konko\n", + "\n", + ">[Konko](https://www.konko.ai/) API is a fully managed Web API designed to help application developers:\n", + "\n", + "1. Select the right LLM(s) for their application\n", + "2. Prototype with various open-source and proprietary LLMs\n", + "3. Move to production in line with their security, privacy, throughput, and latency SLAs, without infrastructure set-up or administration, using Konko AI's SOC 2 compliant infrastructure\n", + "\n", + "\n", + "This example goes over how to use LangChain to interact with `Konko` [models](https://docs.konko.ai/docs/overview)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To run this notebook, you'll need a Konko API key. You can request it by messaging support@konko.ai." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatKonko\n", + "from langchain.prompts.chat import (\n", + " ChatPromptTemplate,\n", + " SystemMessagePromptTemplate,\n", + " AIMessagePromptTemplate,\n", + " HumanMessagePromptTemplate,\n", + ")\n", + "from langchain.schema import AIMessage, HumanMessage, SystemMessage" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Set API Keys\n", + "\n", + "### Option 1: Set Environment Variables\n", + "\n", + "1. You can set environment variables for:\n", + " 1. KONKO_API_KEY (Required)\n", + " 2. OPENAI_API_KEY (Optional)\n", + "2. In your current shell session, use the export command:\n", + "\n", + "```shell\n", + "export KONKO_API_KEY={your_KONKO_API_KEY_here}\n", + "export OPENAI_API_KEY={your_OPENAI_API_KEY_here} # Optional\n", + "```\n", + "\n", + "Alternatively, you can add the above lines directly to your shell startup script (such as .bashrc or .bash_profile for Bash shell and .zshrc for Zsh shell) to have them set automatically every time a new shell session starts.\n", + "\n", + "### Option 2: Set API Keys Programmatically\n", + "\n", + "If you prefer to set your API keys directly within your Python script or Jupyter notebook, you can use the following commands:\n", + "\n", + "```python\n", + "import konko\n", + "\n", + "konko.set_api_key('your_KONKO_API_KEY_here')\n", + "konko.set_openai_api_key('your_OPENAI_API_KEY_here') # Optional\n", + "```\n" + ] + },
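+ { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Optional sanity check (a sketch, not part of the original walkthrough):\n", + "# once your keys are set, the konko SDK can list the models available to\n", + "# your account, as shown in the Konko provider docs.\n", + "import konko\n", + "\n", + "konko.Model.list()" + ] + },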
\n", + "\n", + "### Option 1: Set Environment Variables\n", + "\n", + "1. You can set environment variables for \n", + " 1. KONKO_API_KEY (Required)\n", + " 2. OPENAI_API_KEY (Optional)\n", + "2. In your current shell session, use the export command:\n", + "\n", + "```shell\n", + "export KONKO_API_KEY={your_KONKO_API_KEY_here}\n", + "export OPENAI_API_KEY={your_OPENAI_API_KEY_here} #Optional\n", + "```\n", + "\n", + "Alternatively, you can add the above lines directly to your shell startup script (such as .bashrc or .bash_profile for Bash shell and .zshrc for Zsh shell) to have them set automatically every time a new shell session starts.\n", + "\n", + "### Option 2: Set API Keys Programmatically\n", + "\n", + "If you prefer to set your API keys directly within your Python script or Jupyter notebook, you can use the following commands:\n", + "\n", + "```python\n", + "konko.set_api_key('your_KONKO_API_KEY_here') \n", + "konko.set_openai_api_key('your_OPENAI_API_KEY_here') # Optional\n", + "```\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Calling a model\n", + "\n", + "Find a model on the [Konko overview page](https://docs.konko.ai/docs/overview)\n", + "\n", + "For example, for this [LLama 2 model](https://docs.konko.ai/docs/meta-llama-2-13b-chat). The model id would be: `\"meta-llama/Llama-2-13b-chat-hf\"`\n", + "\n", + "Another way to find the list of models running on the Konko instance is through this [endpoint](https://docs.konko.ai/reference/listmodels).\n", + "\n", + "From here, we can initialize our model:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "chat = ChatKonko(max_tokens=400, model = 'meta-llama/Llama-2-13b-chat-hf')" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "AIMessage(content=\" Sure, I'd be happy to explain the Big Bang Theory briefly!\\n\\nThe Big Bang Theory is the leading explanation for the origin and evolution of the universe, based on a vast amount of observational evidence from many fields of science. In essence, the theory posits that the universe began as an infinitely hot and dense point, known as a singularity, around 13.8 billion years ago. This singularity expanded rapidly, and as it did, it cooled and formed subatomic particles, which eventually coalesced into the first atoms, and later into the stars and galaxies we see today.\\n\\nThe theory gets its name from the idea that the universe began in a state of incredibly high energy and temperature, and has been expanding and cooling ever since. This expansion is thought to have been driven by a mysterious force known as dark energy, which is thought to be responsible for the accelerating expansion of the universe.\\n\\nOne of the key predictions of the Big Bang Theory is that the universe should be homogeneous and isotropic on large scales, meaning that it should look the same in all directions and have the same properties everywhere. 
This prediction has been confirmed by a wealth of observational evidence, including the cosmic microwave background radiation, which is thought to be a remnant of the early universe.\\n\\nOverall, the Big Bang Theory is a well-established and widely accepted explanation for the origins of the universe, and it has been supported by a vast amount of observational evidence from many fields of science.\", additional_kwargs={}, example=False)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "messages = [\n", + " SystemMessage(\n", + " content=\"You are a helpful assistant.\"\n", + " ),\n", + " HumanMessage(\n", + " content=\"Explain Big Bang Theory briefly\"\n", + " ),\n", + "]\n", + "chat(messages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + }, + "vscode": { + "interpreter": { + "hash": "a0a0263b650d907a3bfe41c0f8d6a63a071b884df3cfdc1579f00cdc1aed6b03" + } + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/integrations/document_transformers/nuclia_transformer.ipynb b/docs/extras/integrations/document_transformers/nuclia_transformer.ipynb index d4317c9bba..468be00d94 100644 --- a/docs/extras/integrations/document_transformers/nuclia_transformer.ipynb +++ b/docs/extras/integrations/document_transformers/nuclia_transformer.ipynb @@ -18,7 +18,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -93,8 +93,22 @@ } ], "metadata": { + "kernelspec": { + "display_name": "langchain", + "language": "python", + "name": "python3" + }, "language_info": { - "name": "python" + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.5" }, "orig_nbformat": 4 }, diff --git a/docs/extras/integrations/llms/banana.ipynb b/docs/extras/integrations/llms/banana.ipynb index 44e51faafa..b92db8daba 100644 --- a/docs/extras/integrations/llms/banana.ipynb +++ b/docs/extras/integrations/llms/banana.ipynb @@ -31,11 +31,16 @@ "outputs": [], "source": [ "# get new tokens: https://app.banana.dev/\n", - "# We need two tokens, not just an `api_key`: `BANANA_API_KEY` and `YOUR_MODEL_KEY`\n", + "# We need three parameters to make a Banana.dev API call:\n", + "# * a team api key\n", + "# * the model's unique key\n", + "# * the model's url slug\n", "\n", "import os\n", "from getpass import getpass\n", "\n", + "# You can get this from the main dashboard\n", + "# at https://app.banana.dev\n", "os.environ[\"BANANA_API_KEY\"] = \"YOUR_API_KEY\"\n", "# OR\n", "# BANANA_API_KEY = getpass()" @@ -70,7 +75,9 @@ "metadata": {}, "outputs": [], "source": [ - "llm = Banana(model_key=\"YOUR_MODEL_KEY\")" + "# Both of these are found in your model's \n", + "# detail page in https://app.banana.dev\n", + "llm = Banana(model_key=\"YOUR_MODEL_KEY\", model_url_slug=\"YOUR_MODEL_URL_SLUG\")" ] }, { diff --git a/docs/extras/integrations/llms/ctranslate2.ipynb 
b/docs/extras/integrations/llms/ctranslate2.ipynb new file mode 100644 index 0000000000..1554e13c55 --- /dev/null +++ b/docs/extras/integrations/llms/ctranslate2.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# CTranslate2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**CTranslate2** is a C++ and Python library for efficient inference with Transformer models.\n", + "\n", + "The project implements a custom runtime that applies many performance optimization techniques such as weights quantization, layers fusion, batch reordering, etc., to accelerate and reduce the memory usage of Transformer models on CPU and GPU.\n", + "\n", + "A full list of features and supported models is included in the [project's repository](https://opennmt.net/CTranslate2/guides/transformers.html). To start, please check out the official [quickstart guide](https://opennmt.net/CTranslate2/quickstart.html).\n", + "\n", + "To use it, you should have the `ctranslate2` Python package installed." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install ctranslate2" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To use a Hugging Face model with CTranslate2, it has to be first converted to CTranslate2 format using the `ct2-transformers-converter` command. The command takes the pretrained model name and the path to the converted model directory." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████████████| 2/2 [00:01<00:00, 1.81it/s]\n" + ] + } + ], + "source": [ + "# conversion can take several minutes\n", + "!ct2-transformers-converter --model meta-llama/Llama-2-7b-hf --quantization bfloat16 --output_dir ./llama-2-7b-ct2 --force" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.llms import CTranslate2\n", + "\n", + "llm = CTranslate2(\n", + " # output_dir from above:\n", + " model_path=\"./llama-2-7b-ct2\",\n", + " tokenizer_name=\"meta-llama/Llama-2-7b-hf\",\n", + " device=\"cuda\",\n", + " # device_index can be either a single int or a list of ints,\n", + " # indicating the ids of GPUs to use for inference:\n", + " device_index=[0, 1],\n", + " compute_type=\"bfloat16\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Single call" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "He presented me with plausible evidence for the existence of unicorns: 1) they are mentioned in ancient texts; and, more importantly to him (and not so much as a matter that would convince most people), he had seen one.\n", + "I was skeptical but I didn't want my friend upset by his belief being dismissed outright without any consideration or argument on its behalf whatsoever - which is why we were having this conversation at all! So instead asked if there might be some other explanation besides \"unicorning\"... maybe it could have been an ostrich? Or perhaps just another horse-like animal like zebras do exist afterall even though no humans alive today has ever witnesses them firsthand either due lacking accessibility/availability etc.. 
But then again those animals aren’ t exactly known around here anyway…” And thus began our discussion about whether these creatures actually existed anywhere else outside Earth itself where only few scientists ventured before us nowadays because technology allows exploration beyond borders once thought impossible centuries ago when travel meant walking everywhere yourself until reaching destination point A->B via footsteps alone unless someone helped guide along way through woods full darkness nighttime hours\n" + ] + } + ], + "source": [ + "print(\n", + " llm(\n", + " \"He presented me with plausible evidence for the existence of unicorns: \",\n", + " max_length=256,\n", + " sampling_topk=50,\n", + " sampling_temperature=0.2,\n", + " repetition_penalty=2,\n", + " cache_static_prompt=False,\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Multiple calls:" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "generations=[[Generation(text='The list of top romantic songs:\\n1. “I Will Always Love You” by Whitney Houston\\n2. “Can’t Help Falling in Love” by Elvis Presley\\n3. “Unchained Melody” by The Righteous Brothers\\n4. “I Will Always Love You” by Dolly Parton\\n5. “I Will Always Love You” by Whitney Houston\\n6. “I Will Always Love You” by Dolly Parton\\n7. “I Will Always Love You” by The Beatles\\n8. “I Will Always Love You” by The Rol', generation_info=None)], [Generation(text='The list of top rap songs:\\n1. “God’s Plan” by Drake\\n2. “Rockstar” by Post Malone\\n3. “Bad and Boujee” by Migos\\n4. “Humble” by Kendrick Lamar\\n5. “Bodak Yellow” by Cardi B\\n6. “I’m the One” by DJ Khaled\\n7. “Motorsport” by Migos\\n8. “No Limit” by G-Eazy\\n9. “Bounce Back” by Big Sean\\n10. “', generation_info=None)]] llm_output=None run=[RunInfo(run_id=UUID('628e0491-a310-4d12-81db-6f2c5309d5c2')), RunInfo(run_id=UUID('f88fdbcd-c1f6-4f13-b575-810b80ecbaaf'))]\n" + ] + } + ], + "source": [ + "print(\n", + " llm.generate(\n", + " [\"The list of top romantic songs:\\n1.\", \"The list of top rap songs:\\n1.\"],\n", + " max_length=128\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Integrate the model in an LLMChain" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Who was the US president in the year the first Pokemon game was released?\n", + "\n", + "Let's think step by step. 1996 was the year the first Pokemon game was released.\n", + "\n", + "\\begin{blockquote}\n", + "\n", + "\\begin{itemize}\n", + " \\item 1996 was the year Bill Clinton was president.\n", + " \\item 1996 was the year the first Pokemon game was released.\n", + " \\item 1996 was the year the first Pokemon game was released.\n", + "\n", + "\\end{itemize}\n", + "\\end{blockquote}\n", + "\n", + "I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. 
I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n", + "Comment: @JoeZ. I'm not sure if this is a valid question, but I'm sure it's a fun one.\n", + "\n" + ] + } + ], + "source": [ + "from langchain import PromptTemplate, LLMChain\n", + "\n", + "template = \"\"\"{question}\n", + "\n", + "Let's think step by step. \"\"\"\n", + "prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n", + "\n", + "llm_chain = LLMChain(prompt=prompt, llm=llm)\n", + "\n", + "question = \"Who was the US president in the year the first Pokemon game was released?\"\n", + "\n", + "print(llm_chain.run(question))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.12 ('langchain_venv': venv)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "d1d3a3c58a58885896c5459933a599607cdbb9917d7e1ad7516c8786c51f2dd2" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/extras/integrations/providers/awadb.md b/docs/extras/integrations/providers/awadb.md index 7c2e9943f5..be6d4d66fe 100644 --- a/docs/extras/integrations/providers/awadb.md +++ b/docs/extras/integrations/providers/awadb.md @@ -9,13 +9,20 @@ pip install awadb ``` -## VectorStore +## Vector Store -There exists a wrapper around AwaDB vector databases, allowing you to use it as a vectorstore, -whether for semantic search or example selection. ```python from langchain.vectorstores import AwaDB ``` -For a more detailed walkthrough of the AwaDB wrapper, see [here](/docs/integrations/vectorstores/awadb.html). +See a [usage example](/docs/integrations/vectorstores/awadb). + + +## Text Embedding Models + +```python +from langchain.embeddings import AwaEmbeddings +``` + +See a [usage example](/docs/integrations/text_embedding/awadb). diff --git a/docs/extras/integrations/providers/bananadev.mdx b/docs/extras/integrations/providers/bananadev.mdx index 4961e5f88b..ee7992be74 100644 --- a/docs/extras/integrations/providers/bananadev.mdx +++ b/docs/extras/integrations/providers/bananadev.mdx @@ -1,79 +1,72 @@ # Banana -This page covers how to use the Banana ecosystem within LangChain. -It is broken into two parts: installation and setup, and then references to specific Banana wrappers. +Banana provides serverless GPU inference for AI models, including a CI/CD build pipeline and a simple Python framework (Potassium) to serve your models. + +This page covers how to use the [Banana](https://www.banana.dev) ecosystem within LangChain. + +It is broken into two parts: +* installation and setup, +* references to specific Banana wrappers.
## Installation and Setup - Install with `pip install banana-dev` -- Get an Banana api key and set it as an environment variable (`BANANA_API_KEY`) +- Get a Banana API key from the [Banana.dev dashboard](https://app.banana.dev) and set it as an environment variable (`BANANA_API_KEY`) +- Get your model's key and URL slug from the model's details page ## Define your Banana Template -If you want to use an available language model template you can find one [here](https://app.banana.dev/templates/conceptofmind/serverless-template-palmyra-base). -This template uses the Palmyra-Base model by [Writer](https://writer.com/product/api/). -You can check out an example Banana repository [here](https://github.com/conceptofmind/serverless-template-palmyra-base). +You'll need to set up a GitHub repo for your Banana app. You can get started in 5 minutes using [this guide](https://docs.banana.dev/banana-docs/). + +Alternatively, for a ready-to-go LLM example, you can check out Banana's [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq) GitHub repository. Just fork it and deploy it within Banana. + +Other starter repos are available [here](https://github.com/orgs/bananaml/repositories?q=demo-&type=all&language=&sort=). ## Build the Banana app -Banana Apps must include the "output" key in the return json. -There is a rigid response structure. +To use Banana apps within LangChain, they must include the `outputs` key +in the returned JSON, and the value must be a string. ```python # Return the results as a dictionary -result = {'output': result} +result = {'outputs': result} ``` An example inference function would be: ```python -def inference(model_inputs:dict) -> dict: - global model - global tokenizer - - # Parse out your arguments - prompt = model_inputs.get('prompt', None) - if prompt == None: - return {'message': "No prompt provided"} - - # Run the model - input_ids = tokenizer.encode(prompt, return_tensors='pt').cuda() - output = model.generate( - input_ids, - max_length=100, - do_sample=True, - top_k=50, - top_p=0.95, - num_return_sequences=1, - temperature=0.9, - early_stopping=True, - no_repeat_ngram_size=3, - num_beams=5, - length_penalty=1.5, - repetition_penalty=1.5, - bad_words_ids=[[tokenizer.encode(' ', add_prefix_space=True)[0]]] - ) - - result = tokenizer.decode(output[0], skip_special_tokens=True) - # Return the results as a dictionary - result = {'output': result} - return result +@app.handler("/") +def handler(context: dict, request: Request) -> Response: + """Handle a request to generate code from a prompt.""" + model = context.get("model") + tokenizer = context.get("tokenizer") + max_new_tokens = request.json.get("max_new_tokens", 512) + temperature = request.json.get("temperature", 0.7) + prompt = request.json.get("prompt") + prompt_template = f'''[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```: + {prompt} + [/INST] + ''' + input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda() + output = model.generate(inputs=input_ids, temperature=temperature, max_new_tokens=max_new_tokens) + result = tokenizer.decode(output[0]) + return Response(json={"outputs": result}, status=200) ``` -You can find a full example of a Banana app [here](https://github.com/conceptofmind/serverless-template-palmyra-base/blob/main/app.py). +This example is from the `app.py` file in [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq). ## Wrappers ### LLM -There exists an Banana LLM wrapper, which you can access with +Within LangChain, there exists a Banana LLM wrapper, which you can access with ```python from langchain.llms import Banana ``` -You need to provide a model key located in the dashboard: +You need to provide a model key and model URL slug, which you can get from the model's details page in the [Banana.dev dashboard](https://app.banana.dev). ```python -llm = Banana(model_key="YOUR_MODEL_KEY") +llm = Banana(model_key="YOUR_MODEL_KEY", model_url_slug="YOUR_MODEL_URL_SLUG") ```
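+ +You can then call it like any other LangChain LLM (the prompt below is just illustrative): + +```python +llm("Tell me a joke") +```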
diff --git a/docs/extras/integrations/providers/confident.mdx b/docs/extras/integrations/providers/confident.mdx new file mode 100644 index 0000000000..9823e0c624 --- /dev/null +++ b/docs/extras/integrations/providers/confident.mdx @@ -0,0 +1,22 @@ +# Confident AI + +[Confident AI - Unit Testing for LLMs](https://github.com/confident-ai/deepeval) + +>[DeepEval](https://confident-ai.com) is a package for unit testing LLMs. +> Using Confident, everyone can build robust language models through faster iterations +> using both unit testing and integration testing. We provide support for each step in the iteration, +> from synthetic data creation to testing. + +## Installation and Setup + +First, you'll need to install the `DeepEval` Python package as follows: + +```bash +pip install deepeval +``` + +Afterwards, you can get started in as little as a few lines of code. + +```python +from langchain.callbacks import DeepEvalCallback +``` diff --git a/docs/extras/integrations/providers/konko.mdx b/docs/extras/integrations/providers/konko.mdx new file mode 100644 index 0000000000..1735aa0d01 --- /dev/null +++ b/docs/extras/integrations/providers/konko.mdx @@ -0,0 +1,80 @@ +# Konko +This page covers how to run models on Konko within LangChain. + +Konko API is a fully managed API designed to help application developers: + +1. Select the right LLM(s) for their application +2. Prototype with various open-source and proprietary LLMs +3. Move to production in line with their security, privacy, throughput, and latency SLAs, without infrastructure set-up or administration, using Konko AI's SOC 2 compliant infrastructure + +## Installation and Setup + +### First, you'll need an API key +You can request it by messaging [support@konko.ai](mailto:support@konko.ai). + +### Install Konko AI's Python SDK + +#### 1. Enable a Python 3.8+ environment + +#### 2. Set API Keys + +##### Option 1: Set Environment Variables + +1. You can set environment variables for: + 1. KONKO_API_KEY (Required) + 2. OPENAI_API_KEY (Optional) + +2. In your current shell session, use the export command: + +```shell +export KONKO_API_KEY={your_KONKO_API_KEY_here} +export OPENAI_API_KEY={your_OPENAI_API_KEY_here} # Optional +``` + +Alternatively, you can add the above lines directly to your shell startup script (such as .bashrc or .bash_profile for Bash shell and .zshrc for Zsh shell) to have them set automatically every time a new shell session starts. + +##### Option 2: Set API Keys Programmatically + +If you prefer to set your API keys directly within your Python script or Jupyter notebook, you can use the following commands: + +```python +import konko + +konko.set_api_key('your_KONKO_API_KEY_here') +konko.set_openai_api_key('your_OPENAI_API_KEY_here') # Optional +``` + +#### 3. Install the SDK + + +```shell +pip install konko +``` + +#### 4. 
Verify Installation & Authentication + +```python +# Confirm konko has installed successfully +import konko + +# Confirm API keys from Konko and OpenAI are set properly +konko.Model.list() +``` + +## Calling a model + +Find a model on the [Konko Introduction page](https://docs.konko.ai/docs#available-models). + +For example, the model id for this [Llama 2 model](https://docs.konko.ai/docs/meta-llama-2-13b-chat) is `"meta-llama/Llama-2-13b-chat-hf"`. + +Another way to find the list of models running on the Konko instance is through this [endpoint](https://docs.konko.ai/reference/listmodels). + +From here, we can initialize our model: + +```python +chat_instance = ChatKonko(max_tokens=10, model='meta-llama/Llama-2-13b-chat-hf') +``` + +And run it: + +```python +msg = HumanMessage(content="Hi") +chat_response = chat_instance([msg]) +``` diff --git a/docs/extras/integrations/providers/modelscope.mdx b/docs/extras/integrations/providers/modelscope.mdx index c37c5f60c4..df6add2bb1 100644 --- a/docs/extras/integrations/providers/modelscope.mdx +++ b/docs/extras/integrations/providers/modelscope.mdx @@ -1,20 +1,24 @@ # ModelScope +>[ModelScope](https://www.modelscope.cn/home) is a large repository of models and datasets. + This page covers how to use the modelscope ecosystem within LangChain. It is broken into two parts: installation and setup, and then references to specific modelscope wrappers. ## Installation and Setup -* Install the Python SDK with `pip install modelscope` +Install the `modelscope` package. + +```bash +pip install modelscope +``` -## Wrappers -### Embeddings +## Text Embedding Models -There exists a modelscope Embeddings wrapper, which you can access with ```python from langchain.embeddings import ModelScopeEmbeddings ```
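+ +You can then instantiate the embedding model; the model id below is only illustrative, and any ModelScope embedding model id should work: + +```python +embeddings = ModelScopeEmbeddings(model_id="damo/nlp_corom_sentence-embedding_english-base") +```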
-For a more detailed walkthrough of this, see [this notebook](/docs/integrations/text_embedding/modelscope_hub.html) +For a more detailed walkthrough of this, see [this notebook](/docs/integrations/text_embedding/modelscope_hub). diff --git a/docs/extras/integrations/providers/nlpcloud.mdx b/docs/extras/integrations/providers/nlpcloud.mdx index 050da5af04..e401faeb5a 100644 --- a/docs/extras/integrations/providers/nlpcloud.mdx +++ b/docs/extras/integrations/providers/nlpcloud.mdx @@ -1,17 +1,31 @@ # NLPCloud -This page covers how to use the NLPCloud ecosystem within LangChain. -It is broken into two parts: installation and setup, and then references to specific NLPCloud wrappers. +>[NLP Cloud](https://docs.nlpcloud.com/#introduction) is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. + ## Installation and Setup -- Install the Python SDK with `pip install nlpcloud` + +- Install the `nlpcloud` package. + +```bash +pip install nlpcloud +``` + - Get an NLPCloud api key and set it as an environment variable (`NLPCLOUD_API_KEY`) -## Wrappers -### LLM +## LLM + +See a [usage example](/docs/integrations/llms/nlpcloud). -There exists an NLPCloud LLM wrapper, which you can access with ```python from langchain.llms import NLPCloud ``` + +## Text Embedding Models + +See a [usage example](/docs/integrations/text_embedding/nlp_cloud). + +```python +from langchain.embeddings import NLPCloudEmbeddings +``` diff --git a/docs/extras/integrations/providers/portkey/index.md b/docs/extras/integrations/providers/portkey/index.md index 51a9962386..daefe35a22 100644 --- a/docs/extras/integrations/providers/portkey/index.md +++ b/docs/extras/integrations/providers/portkey/index.md @@ -1,4 +1,10 @@ # Portkey + +>[Portkey](https://docs.portkey.ai/overview/introduction) is a platform designed to streamline the deployment +> and management of Generative AI applications. +> It provides comprehensive features for monitoring, managing models, +> and improving the performance of your AI applications. + ## LLMOps for Langchain Portkey brings production readiness to Langchain. With Portkey, you can diff --git a/docs/extras/integrations/providers/portkey/logging_tracing_portkey.ipynb b/docs/extras/integrations/providers/portkey/logging_tracing_portkey.ipynb index e26fabd659..7fd2cd4161 100644 --- a/docs/extras/integrations/providers/portkey/logging_tracing_portkey.ipynb +++ b/docs/extras/integrations/providers/portkey/logging_tracing_portkey.ipynb @@ -1,19 +1,14 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Log, Trace, and Monitor Langchain LLM Calls\n", + "# Log, Trace, and Monitor\n", "\n", "When building apps or agents using Langchain, you end up making multiple API calls to fulfill a single user request. However, these requests are not chained when you want to analyse them. With [**Portkey**](/docs/ecosystem/integrations/portkey), all the embeddings, completion, and other requests from a single user request will get logged and traced to a common ID, enabling you to gain full visibility of user interactions.\n", "\n", - "This notebook serves as a step-by-step guide on how to integrate and use Portkey in your Langchain app." + "This notebook serves as a step-by-step guide on how to log, trace, and monitor LangChain LLM calls using `Portkey` in your LangChain app." ] }, { @@ -234,9 +229,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/extras/integrations/providers/spacy.mdx b/docs/extras/integrations/providers/spacy.mdx index f4d49497dd..ab9b685898 100644 --- a/docs/extras/integrations/providers/spacy.mdx +++ b/docs/extras/integrations/providers/spacy.mdx @@ -18,3 +18,11 @@ See a [usage example](/docs/modules/data_connection/document_transformers/text_s ```python from langchain.text_splitter import SpacyTextSplitter ``` + +## Text Embedding Models + +See a [usage example](/docs/integrations/text_embedding/spacy_embedding). + +```python +from langchain.embeddings.spacy_embeddings import SpacyEmbeddings +``` diff --git a/docs/extras/integrations/providers/vectara/index.mdx b/docs/extras/integrations/providers/vectara/index.mdx index 0e30735f0b..abd8283735 100644 --- a/docs/extras/integrations/providers/vectara/index.mdx +++ b/docs/extras/integrations/providers/vectara/index.mdx @@ -11,9 +11,10 @@ What is Vectara? - You can use Vectara's integration with LangChain as a Vector store or using the Retriever abstraction. 
## Installation and Setup -To use Vectara with LangChain no special installation steps are required. You just have to provide your customer_id, corpus ID, and an API key created within the Vectara console to enable indexing and searching. +To use Vectara with LangChain, no special installation steps are required. +To get started, follow our [quickstart](https://docs.vectara.com/docs/quickstart) guide to create an account, a corpus, and an API key. +Once you have these, you can provide them as arguments to the Vectara vectorstore, or you can set them as environment variables. -Alternatively these can be provided as environment variables - export `VECTARA_CUSTOMER_ID`="your_customer_id" - export `VECTARA_CORPUS_ID`="your_corpus_id" - export `VECTARA_API_KEY`="your-vectara-api-key" diff --git a/docs/extras/integrations/text_embedding/Awa.ipynb b/docs/extras/integrations/text_embedding/awadb.ipynb similarity index 89% rename from docs/extras/integrations/text_embedding/Awa.ipynb rename to docs/extras/integrations/text_embedding/awadb.ipynb index 1fb7ddca6f..f2c1e73392 100644 --- a/docs/extras/integrations/text_embedding/Awa.ipynb +++ b/docs/extras/integrations/text_embedding/awadb.ipynb @@ -5,9 +5,11 @@ "id": "b14a24db", "metadata": {}, "source": [ - "# AwaEmbedding\n", + "# AwaDB\n", "\n", - "This notebook explains how to use AwaEmbedding, which is included in [awadb](https://github.com/awa-ai/awadb), to embedding texts in langchain." + ">[AwaDB](https://github.com/awa-ai/awadb) is an AI-native database for the search and storage of embedding vectors used by LLM applications.\n", + "\n", + "This notebook explains how to use `AwaEmbeddings` in LangChain." ] }, { @@ -101,7 +103,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.4" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/docs/extras/integrations/text_embedding/bedrock.ipynb b/docs/extras/integrations/text_embedding/bedrock.ipynb index 7c16cb8ead..0dbbcd080f 100644 --- a/docs/extras/integrations/text_embedding/bedrock.ipynb +++ b/docs/extras/integrations/text_embedding/bedrock.ipynb @@ -5,7 +5,9 @@ "id": "75e378f5-55d7-44b6-8e2e-6d7b8b171ec4", "metadata": {}, "source": [ - "# Bedrock Embeddings" + "# Bedrock\n", + "\n", + ">[Amazon Bedrock](https://aws.amazon.com/bedrock/) is a fully managed service that makes foundation models (FMs) from leading AI startups and Amazon available via an API, so you can choose from a wide range of FMs to find the model that is best suited for your use case.\n" ] }, { @@ -91,7 +93,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/docs/extras/integrations/text_embedding/bge_huggingface.ipynb b/docs/extras/integrations/text_embedding/bge_huggingface.ipynb index bcf196fc20..923ba92874 100644 --- a/docs/extras/integrations/text_embedding/bge_huggingface.ipynb +++ b/docs/extras/integrations/text_embedding/bge_huggingface.ipynb @@ -5,26 +5,29 @@ "id": "719619d3", "metadata": {}, "source": [ - "# BGE Hugging Face Embeddings\n", + "# BGE on Hugging Face\n", "\n", - "This notebook shows how to use BGE Embeddings through Hugging Face" + ">[BGE models on Hugging Face](https://huggingface.co/BAAI/bge-large-en) are [the best open-source embedding models](https://huggingface.co/spaces/mteb/leaderboard).\n", + ">The BGE models are created by the [Beijing Academy of Artificial Intelligence (BAAI)](https://www.baai.ac.cn/english.html). 
`BAAI` is a private non-profit organization engaged in AI research and development.\n", + "\n", + "This notebook shows how to use `BGE Embeddings` through `Hugging Face`." ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "f7a54279", "metadata": { "scrolled": true }, "outputs": [], "source": [ - "# !pip install sentence_transformers" + "#!pip install sentence_transformers" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "9e1d5b6b", "metadata": {}, "outputs": [], @@ -43,12 +46,24 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 5, "id": "e59d1a89", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "384" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "embedding = hf.embed_query(\"hi this is harrison\")" + "embedding = hf.embed_query(\"hi this is harrison\")\n", + "len(embedding)" ] }, { @@ -76,7 +91,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.1" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb b/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb index ea607467fb..4c0c515e80 100644 --- a/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb +++ b/docs/extras/integrations/text_embedding/google_vertex_ai_palm.ipynb @@ -1,13 +1,14 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# Google Cloud Platform Vertex AI PaLM \n", + "# Google Vertex AI PaLM \n", "\n", - "Note: This is seperate from the Google PaLM integration, it exposes [Vertex AI PaLM API](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) on Google Cloud. \n", + ">[Vertex AI PaLM API](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) is a service on Google Cloud exposing embedding models.\n", + "\n", + "Note: This integration is separate from the Google PaLM integration.\n", "\n", "By default, Google Cloud [does not use](https://cloud.google.com/vertex-ai/docs/generative-ai/data-governance#foundation_model_development) Customer Data to train its foundation models as part of Google Cloud`s AI/ML Privacy Commitment. More details about how Google processes data can also be found in [Google's Customer Data Processing Addendum (CDPA)](https://cloud.google.com/terms/data-processing-addendum).\n", "\n", @@ -96,7 +97,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/extras/integrations/text_embedding/huggingfacehub.ipynb b/docs/extras/integrations/text_embedding/huggingfacehub.ipynb index a86df86d74..cb897f8693 100644 --- a/docs/extras/integrations/text_embedding/huggingfacehub.ipynb +++ b/docs/extras/integrations/text_embedding/huggingfacehub.ipynb @@ -5,13 +5,23 @@ "id": "ed47bb62", "metadata": {}, "source": [ - "# Hugging Face Hub\n", + "# Hugging Face\n", "Let's load the Hugging Face Embedding class."
] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, + "id": "16b20335-da1d-46ba-aa23-fbf3e2c6fe60", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install langchain sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "id": "861521a9", "metadata": {}, "outputs": [], @@ -21,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 3, "id": "ff9be586", "metadata": {}, "outputs": [], @@ -31,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "id": "d0a98ae9", "metadata": {}, "outputs": [], @@ -41,7 +51,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 5, "id": "5d6c682b", "metadata": {}, "outputs": [], @@ -51,7 +61,28 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 6, + "id": "b57b8ce9-ef7d-4e63-979e-aa8763d1f9a8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-0.04895168915390968, -0.03986193612217903, -0.021562768146395683]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query_result[:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 7, "id": "bb5e74c0", "metadata": {}, "outputs": [], @@ -59,20 +90,72 @@ "doc_result = embeddings.embed_documents([text])" ] }, + { + "cell_type": "markdown", + "id": "92019ef1-5d30-4985-b4e6-c0d98bdfe265", + "metadata": {}, + "source": [ + "## Hugging Face Inference API\n", + "We can also access embedding models via the Hugging Face Inference API, which does not require us to install ``sentence_transformers`` and download models locally." + ] + }, { "cell_type": "code", - "execution_count": null, - "id": "aaad49f8", + "execution_count": 1, + "id": "66f5c6ba-1446-43e1-b012-800d17cef300", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + "Enter your HF Inference API Key:\n", + "\n", + " ········\n" + ] + } + ], + "source": [ + "import getpass\n", + "\n", + "inference_api_key = getpass.getpass(\"Enter your HF Inference API Key:\\n\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d0623c1f-cd82-4862-9bce-3655cb9b66ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[-0.038338541984558105, 0.1234646737575531, -0.028642963618040085]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n", + "\n", + "embeddings = HuggingFaceInferenceAPIEmbeddings(\n", + " api_key=inference_api_key,\n", + " model_name=\"sentence-transformers/all-MiniLM-l6-v2\"\n", + ")\n", + "\n", + "query_result = embeddings.embed_query(text)\n", + "query_result[:3]" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "poetry-venv", "language": "python", - "name": "python3" + "name": "poetry-venv" }, "language_info": { "codemirror_mode": { diff --git a/docs/extras/integrations/text_embedding/modelscope_hub.ipynb b/docs/extras/integrations/text_embedding/modelscope_hub.ipynb index 765d46769c..e2f47c4f3a 100644 --- a/docs/extras/integrations/text_embedding/modelscope_hub.ipynb +++ b/docs/extras/integrations/text_embedding/modelscope_hub.ipynb @@ -1,12 +1,13 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# ModelScope\n", "\n", + ">[ModelScope](https://www.modelscope.cn/home) is big 
repository of the models and datasets.\n", + "\n", "Let's load the ModelScope Embedding class." ] }, @@ -67,16 +68,23 @@ ], "metadata": { "kernelspec": { - "display_name": "chatgpt", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "3.9.15" - }, - "orig_nbformat": 4 + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/extras/integrations/text_embedding/mosaicml.ipynb b/docs/extras/integrations/text_embedding/mosaicml.ipynb index 2d91c8d9c5..24d7aecb72 100644 --- a/docs/extras/integrations/text_embedding/mosaicml.ipynb +++ b/docs/extras/integrations/text_embedding/mosaicml.ipynb @@ -1,15 +1,14 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# MosaicML embeddings\n", + "# MosaicML\n", "\n", - "[MosaicML](https://docs.mosaicml.com/en/latest/inference.html) offers a managed inference service. You can either use a variety of open source models, or deploy your own.\n", + ">[MosaicML](https://docs.mosaicml.com/en/latest/inference.html) offers a managed inference service. You can either use a variety of open source models, or deploy your own.\n", "\n", - "This example goes over how to use LangChain to interact with MosaicML Inference for text embedding." + "This example goes over how to use LangChain to interact with `MosaicML` Inference for text embedding." ] }, { @@ -94,6 +93,11 @@ } ], "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, "language_info": { "codemirror_mode": { "name": "ipython", @@ -103,9 +107,10 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3" + "pygments_lexer": "ipython3", + "version": "3.10.12" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/extras/integrations/text_embedding/nlp_cloud.ipynb b/docs/extras/integrations/text_embedding/nlp_cloud.ipynb index 73ae71fe0f..9567d59c4b 100644 --- a/docs/extras/integrations/text_embedding/nlp_cloud.ipynb +++ b/docs/extras/integrations/text_embedding/nlp_cloud.ipynb @@ -7,7 +7,7 @@ "source": [ "# NLP Cloud\n", "\n", - "NLP Cloud is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. \n", + ">[NLP Cloud](https://docs.nlpcloud.com/#introduction) is an artificial intelligence platform that allows you to use the most advanced AI engines, and even train your own engines with your own data. 
\n", "\n", "The [embeddings](https://docs.nlpcloud.com/#embeddings) endpoint offers the following model:\n", "\n", @@ -80,7 +80,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.11.2 64-bit", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -94,7 +94,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.2" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb b/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb index fe5299ae6f..ec80112e10 100644 --- a/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb +++ b/docs/extras/integrations/text_embedding/sagemaker-endpoint.ipynb @@ -5,11 +5,13 @@ "id": "1f83f273", "metadata": {}, "source": [ - "# SageMaker Endpoint Embeddings\n", + "# SageMaker\n", "\n", - "Let's load the SageMaker Endpoints Embeddings class. The class can be used if you host, e.g. your own Hugging Face model on SageMaker.\n", + "Let's load the `SageMaker Endpoints Embeddings` class. The class can be used if you host, e.g. your own Hugging Face model on SageMaker.\n", "\n", - "For instructions on how to do this, please see [here](https://www.philschmid.de/custom-inference-huggingface-sagemaker). **Note**: In order to handle batched requests, you will need to adjust the return line in the `predict_fn()` function within the custom `inference.py` script:\n", + "For instructions on how to do this, please see [here](https://www.philschmid.de/custom-inference-huggingface-sagemaker). \n", + "\n", + "**Note**: In order to handle batched requests, you will need to adjust the return line in the `predict_fn()` function within the custom `inference.py` script:\n", "\n", "Change from\n", "\n", @@ -143,7 +145,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/extras/integrations/text_embedding/self-hosted.ipynb b/docs/extras/integrations/text_embedding/self-hosted.ipynb index 00c497220e..47faa6bf2d 100644 --- a/docs/extras/integrations/text_embedding/self-hosted.ipynb +++ b/docs/extras/integrations/text_embedding/self-hosted.ipynb @@ -5,8 +5,8 @@ "id": "eec4efda", "metadata": {}, "source": [ - "# Self Hosted Embeddings\n", - "Let's load the SelfHostedEmbeddings, SelfHostedHuggingFaceEmbeddings, and SelfHostedHuggingFaceInstructEmbeddings classes." + "# Self Hosted\n", + "Let's load the `SelfHostedEmbeddings`, `SelfHostedHuggingFaceEmbeddings`, and `SelfHostedHuggingFaceInstructEmbeddings` classes." 
] }, { @@ -149,9 +149,7 @@ "cell_type": "code", "execution_count": null, "id": "fc1bfd0f", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "query_result = embeddings.embed_query(text)" @@ -182,7 +180,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/extras/integrations/text_embedding/sentence_transformers.ipynb b/docs/extras/integrations/text_embedding/sentence_transformers.ipynb index 67eb83ab7c..e4649e6b71 100644 --- a/docs/extras/integrations/text_embedding/sentence_transformers.ipynb +++ b/docs/extras/integrations/text_embedding/sentence_transformers.ipynb @@ -1,16 +1,15 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "id": "ed47bb62", "metadata": {}, "source": [ - "# Sentence Transformers Embeddings\n", + "# Sentence Transformers\n", "\n", - "[SentenceTransformers](https://www.sbert.net/) embeddings are called using the `HuggingFaceEmbeddings` integration. We have also added an alias for `SentenceTransformerEmbeddings` for users who are more familiar with directly using that package.\n", + ">[SentenceTransformers](https://www.sbert.net/) embeddings are called using the `HuggingFaceEmbeddings` integration. We have also added an alias for `SentenceTransformerEmbeddings` for users who are more familiar with directly using that package.\n", "\n", - "SentenceTransformers is a python package that can generate text and image embeddings, originating from [Sentence-BERT](https://arxiv.org/abs/1908.10084)" + "`SentenceTransformers` is a Python package that can generate text and image embeddings, originating from [Sentence-BERT](https://arxiv.org/abs/1908.10084)." ] }, { @@ -109,7 +108,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.10.12" }, "vscode": { "interpreter": { diff --git a/docs/extras/integrations/text_embedding/spacy_embedding.ipynb b/docs/extras/integrations/text_embedding/spacy_embedding.ipynb index bfea82d5d4..edda4828b4 100644 --- a/docs/extras/integrations/text_embedding/spacy_embedding.ipynb +++ b/docs/extras/integrations/text_embedding/spacy_embedding.ipynb @@ -1,21 +1,31 @@ { "cells": [ { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# Spacy Embedding\n", + "# SpaCy\n", "\n", - "### Loading the Spacy embedding class to generate and query embeddings" + ">[spaCy](https://spacy.io/) is an open-source software library for advanced natural language processing, written in the programming languages Python and Cython.\n", + " \n", + "\n", + "## Installation and Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install spacy" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "#### Import the necessary classes" + "Import the necessary classes" ] }, { @@ -28,11 +38,12 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "#### Initialize SpacyEmbeddings.This will load the Spacy model into memory." + "## Example\n", + "\n", + "Initialize `SpacyEmbeddings`. This will load the spaCy model into memory." ] }, { @@ -45,11 +56,10 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "#### Define some example texts . These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews."
+ "Define some example texts . These could be any documents that you want to analyze - for example, news articles, social media posts, or product reviews." ] }, { @@ -67,11 +77,10 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "#### Generate and print embeddings for the texts . The SpacyEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification." + "Generate and print embeddings for the texts . The SpacyEmbeddings class generates an embedding for each document, which is a numerical representation of the document's content. These embeddings can be used for various natural language processing tasks, such as document similarity comparison or text classification." ] }, { @@ -86,11 +95,10 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "#### Generate and print an embedding for a single piece of text. You can also generate an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query." + "Generate and print an embedding for a single piece of text. You can also generate an embedding for a single piece of text, such as a search query. This can be useful for tasks like information retrieval, where you want to find documents that are similar to a given query." ] }, { @@ -106,11 +114,24 @@ } ], "metadata": { - "language_info": { - "name": "python" + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "orig_nbformat": 4 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/docs/extras/integrations/vectorstores/nucliadb.ipynb b/docs/extras/integrations/vectorstores/nucliadb.ipynb new file mode 100644 index 0000000000..f5fe2299c8 --- /dev/null +++ b/docs/extras/integrations/vectorstores/nucliadb.ipynb @@ -0,0 +1,126 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# NucliaDB\n", + "\n", + "You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n", + "\n", + "When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [create a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)." 
+ ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/integrations/vectorstores/pgvector.ipynb b/docs/extras/integrations/vectorstores/pgvector.ipynb index 8ef6ec1fa2..397758f216 100644 --- a/docs/extras/integrations/vectorstores/pgvector.ipynb +++ b/docs/extras/integrations/vectorstores/pgvector.ipynb @@ -24,42 +24,11 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: pgvector in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (0.1.8)\n", - "Requirement already satisfied: numpy in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from pgvector) (1.24.3)\n", - "Requirement already satisfied: openai in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (0.27.7)\n", - "Requirement already satisfied: requests>=2.20 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from openai) (2.28.2)\n", - "Requirement already satisfied: tqdm in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from openai) (4.65.0)\n", - "Requirement already satisfied: aiohttp in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from openai) (3.8.4)\n", - "Requirement already satisfied: 
charset-normalizer<4,>=2 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.20->openai) (3.1.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.20->openai) (3.4)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.20->openai) (1.26.15)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.20->openai) (2023.5.7)\n", - "Requirement already satisfied: attrs>=17.3.0 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from aiohttp->openai) (23.1.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from aiohttp->openai) (6.0.4)\n", - "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from aiohttp->openai) (4.0.2)\n", - "Requirement already satisfied: yarl<2.0,>=1.0 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from aiohttp->openai) (1.9.2)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from aiohttp->openai) (1.3.3)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from aiohttp->openai) (1.3.1)\n", - "Requirement already satisfied: psycopg2-binary in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (2.9.6)\n", - "Requirement already satisfied: tiktoken in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (0.4.0)\n", - "Requirement already satisfied: regex>=2022.1.18 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from tiktoken) (2023.5.5)\n", - "Requirement already satisfied: requests>=2.26.0 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from tiktoken) (2.28.2)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.1.0)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (3.4)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (1.26.15)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/joyeed/langchain/langchain/.venv/lib/python3.9/site-packages (from requests>=2.26.0->tiktoken) (2023.5.7)\n" - ] - } - ], + "outputs": [], "source": [ "# Pip install necessary package\n", "!pip install pgvector\n", @@ -77,17 +46,14 @@ }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OpenAI API Key:········\n" - ] + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:02:16.802456Z", + "start_time": "2023-09-09T08:02:07.065604Z" } - ], + }, + "outputs": [], "source": [ "import os\n", "import getpass\n", @@ -97,18 +63,20 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 3, "metadata": { - "tags": [] + "tags": [], + "ExecuteTime": { + "end_time": 
"2023-09-09T08:02:19.742896Z", + "start_time": "2023-09-09T08:02:19.732527Z" + } }, "outputs": [ { "data": { - "text/plain": [ - "False" - ] + "text/plain": "False" }, - "execution_count": 61, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -123,9 +91,13 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": { - "tags": [] + "tags": [], + "ExecuteTime": { + "end_time": "2023-09-09T08:02:23.144824Z", + "start_time": "2023-09-09T08:02:22.047801Z" + } }, "outputs": [], "source": [ @@ -138,8 +110,13 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:02:25.452472Z", + "start_time": "2023-09-09T08:02:25.441563Z" + } + }, "outputs": [], "source": [ "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", @@ -152,8 +129,13 @@ }, { "cell_type": "code", - "execution_count": 3, - "metadata": {}, + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:02:28.174088Z", + "start_time": "2023-09-09T08:02:28.162698Z" + } + }, "outputs": [], "source": [ "# PGVector needs the connection string to the database.\n", @@ -174,15 +156,22 @@ }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## Similarity Search with Euclidean Distance (Default)" - ] + ], + "metadata": { + "collapsed": false + } }, { "cell_type": "code", - "execution_count": 16, - "metadata": {}, + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:04:16.696625Z", + "start_time": "2023-09-09T08:02:31.817790Z" + } + }, "outputs": [], "source": [ "# The PGVector Module will try to create a table with the name of the collection.\n", @@ -200,8 +189,13 @@ }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:05:11.104135Z", + "start_time": "2023-09-09T08:05:10.548998Z" + } + }, "outputs": [], "source": [ "query = \"What did the president say about Ketanji Brown Jackson\"\n", @@ -210,15 +204,20 @@ }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2023-09-09T08:05:13.532334Z", + "start_time": "2023-09-09T08:05:13.523191Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--------------------------------------------------------------------------------\n", - "Score: 0.18460171628856903\n", + "Score: 0.18456886638850434\n", "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", "\n", "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", @@ -228,17 +227,97 @@ "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "Score: 0.18460171628856903\n", - "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. 
Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", + "Score: 0.21742627672631343\n", + "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", "\n", - "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", + "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", "\n", - "One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \n", + "We can do both. At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", "\n", - "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", + "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "\n", + "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n", + "\n", + "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "Score: 0.18470284560586236\n", + "Score: 0.22641793174529334\n", + "And for our LGBTQ+ Americans, let’s finally get the bipartisan Equality Act to my desk. The onslaught of state laws targeting transgender Americans and their families is wrong. \n", + "\n", + "As I said last year, especially to our younger transgender Americans, I will always have your back as your President, so you can be yourself and reach your God-given potential. \n", + "\n", + "While it often appears that we never agree, that isn’t true. I signed 80 bipartisan bills into law last year. From preventing government shutdowns to protecting Asian-Americans from still-too-common hate crimes to reforming military justice. \n", + "\n", + "And soon, we’ll strengthen the Violence Against Women Act that I first wrote three decades ago. It is important for us to show the nation that we can come together and do big things. \n", + "\n", + "So tonight I’m offering a Unity Agenda for the Nation. Four big things we can do together. \n", + "\n", + "First, beat the opioid epidemic.\n", + "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.22670040608054465\n", + "Tonight, I’m announcing a crackdown on these companies overcharging American businesses and consumers. \n", + "\n", + "And as Wall Street firms take over more nursing homes, quality in those homes has gone down and costs have gone up. \n", + "\n", + "That ends on my watch. 
\n", + "\n", + "Medicare is going to set higher standards for nursing homes and make sure your loved ones get the care they deserve and expect. \n", + "\n", + "We’ll also cut costs and keep the economy going strong by giving workers a fair shot, provide more training and apprenticeships, hire them based on their skills not degrees. \n", + "\n", + "Let’s pass the Paycheck Fairness Act and paid leave. \n", + "\n", + "Raise the minimum wage to $15 an hour and extend the Child Tax Credit, so no one has to raise a family in poverty. \n", + "\n", + "Let’s increase Pell Grants and increase our historic support of HBCUs, and invest in what Jill—our First Lady who teaches full-time—calls America’s best-kept secret: community colleges.\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "for doc, score in docs_with_score:\n", + " print(\"-\" * 80)\n", + " print(\"Score: \", score)\n", + " print(doc.page_content)\n", + " print(\"-\" * 80)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## Maximal Marginal Relevance Search (MMR)\n", + "Maximal marginal relevance optimizes for similarity to query AND diversity among selected documents." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 10, + "outputs": [], + "source": [ + "docs_with_score = db.max_marginal_relevance_search_with_score(query)" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-09T08:05:23.276819Z", + "start_time": "2023-09-09T08:05:21.972256Z" + } + } + }, + { + "cell_type": "code", + "execution_count": 11, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--------------------------------------------------------------------------------\n", + "Score: 0.18453882564037527\n", "Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \n", "\n", "Tonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \n", @@ -248,18 +327,68 @@ "And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.\n", "--------------------------------------------------------------------------------\n", "--------------------------------------------------------------------------------\n", - "Score: 0.21730864082247825\n", - "A former top litigator in private practice. A former federal public defender. And from a family of public school educators and police officers. A consensus builder. Since she’s been nominated, she’s received a broad range of support—from the Fraternal Order of Police to former judges appointed by Democrats and Republicans. \n", + "Score: 0.23523731441720075\n", + "We can’t change how divided we’ve been. But we can change how we move forward—on COVID-19 and other issues we must face together. \n", "\n", - "And if we are to advance liberty and justice, we need to secure the Border and fix the immigration system. \n", + "I recently visited the New York City Police Department days after the funerals of Officer Wilbert Mora and his partner, Officer Jason Rivera. \n", "\n", - "We can do both. 
At our border, we’ve installed new technology like cutting-edge scanners to better detect drug smuggling. \n", + "They were responding to a 9-1-1 call when a man shot and killed them with a stolen gun. \n", "\n", - "We’ve set up joint patrols with Mexico and Guatemala to catch more human traffickers. \n", + "Officer Mora was 27 years old. \n", "\n", - "We’re putting in place dedicated immigration judges so families fleeing persecution and violence can have their cases heard faster. \n", + "Officer Rivera was 22. \n", "\n", - "We’re securing commitments and supporting partners in South and Central America to host more refugees and secure their own borders.\n", + "Both Dominican Americans who’d grown up on the same streets they later chose to patrol as police officers. \n", + "\n", + "I spoke with their families and told them that we are forever in debt for their sacrifice, and we will carry on their mission to restore the trust and safety every community deserves. \n", + "\n", + "I’ve worked on these issues a long time. \n", + "\n", + "I know what works: Investing in crime preventionand community police officers who’ll walk the beat, who’ll know the neighborhood, and who can restore trust and safety.\n", + "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.2448441215698569\n", + "One was stationed at bases and breathing in toxic smoke from “burn pits” that incinerated wastes of war—medical and hazard material, jet fuel, and more. \n", + "\n", + "When they came home, many of the world’s fittest and best trained warriors were never the same. \n", + "\n", + "Headaches. Numbness. Dizziness. \n", + "\n", + "A cancer that would put them in a flag-draped coffin. \n", + "\n", + "I know. \n", + "\n", + "One of those soldiers was my son Major Beau Biden. \n", + "\n", + "We don’t know for sure if a burn pit was the cause of his brain cancer, or the diseases of so many of our troops. \n", + "\n", + "But I’m committed to finding out everything we can. \n", + "\n", + "Committed to military families like Danielle Robinson from Ohio. \n", + "\n", + "The widow of Sergeant First Class Heath Robinson. \n", + "\n", + "He was born a soldier. Army National Guard. Combat medic in Kosovo and Iraq. \n", + "\n", + "Stationed near Baghdad, just yards from burn pits the size of football fields. \n", + "\n", + "Heath’s widow Danielle is here with us tonight. They loved going to Ohio State football games. He loved building Legos with their daughter.\n", + "--------------------------------------------------------------------------------\n", + "--------------------------------------------------------------------------------\n", + "Score: 0.2513994424701056\n", + "And I’m taking robust action to make sure the pain of our sanctions is targeted at Russia’s economy. And I will use every tool at our disposal to protect American businesses and consumers. \n", + "\n", + "Tonight, I can announce that the United States has worked with 30 other countries to release 60 Million barrels of oil from reserves around the world. \n", + "\n", + "America will lead that effort, releasing 30 Million barrels from our own Strategic Petroleum Reserve. And we stand ready to do more if necessary, unified with our allies. \n", + "\n", + "These steps will help blunt gas prices here at home. And I know the news about what’s happening can seem alarming. 
\n", + "\n", + "But I want you to know that we are going to be okay. \n", + "\n", + "When the history of this era is written Putin’s war on Ukraine will have left Russia weaker and the rest of the world stronger. \n", + "\n", + "While it shouldn’t have taken something so terrible for people around the world to see what’s at stake now everyone sees it clearly.\n", "--------------------------------------------------------------------------------\n" ] } @@ -270,7 +399,14 @@ " print(\"Score: \", score)\n", " print(doc.page_content)\n", " print(\"-\" * 80)" - ] + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-09T08:05:27.478580Z", + "start_time": "2023-09-09T08:05:27.470138Z" + } + } }, { "cell_type": "markdown", diff --git a/docs/extras/integrations/vectorstores/redis.ipynb b/docs/extras/integrations/vectorstores/redis.ipynb index ae17b0e4e6..4b13672fc5 100644 --- a/docs/extras/integrations/vectorstores/redis.ipynb +++ b/docs/extras/integrations/vectorstores/redis.ipynb @@ -10,9 +10,9 @@ "\n", "## What is Redis?\n", "\n", - "Most developers from a web services background are probably familiar with Redis. At it's core, Redis is an open-source key-value store that can be used as a cache, message broker, and database. Developers choice Redis because it is fast, has a large ecosystem of client libraries, and has been deployed by major enterprises for years.\n", + "Most developers from a web services background are probably familiar with Redis. At it's core, Redis is an open-source key-value store that can be used as a cache, message broker, and database. Developers choose Redis because it is fast, has a large ecosystem of client libraries, and has been deployed by major enterprises for years.\n", "\n", - "In addition to the traditional uses of Redis. Redis also provides capabilities built directly into Redis. These capabilities include the Search and Query capability that allows users to create secondary index structures within Redis. This allows Redis to be a Vector Database, at the speed of a cache. \n", + "On top of these traditional use cases, Redis provides additional capabilities like the Search and Query capability that allows users to create secondary index structures within Redis. This allows Redis to be a Vector Database, at the speed of a cache. \n", "\n", "\n", "## Redis as a Vector Database\n", @@ -123,7 +123,7 @@ "source": [ "## Install Redis Python Client\n", "\n", - "Redis-py is the officially supported client by Redis. Recently released is the RedisVL client which is purpose built for the Vector Database use cases. Both can be installed with pip." + "Redis-py is the officially supported client by Redis. Recently released is the RedisVL client which is purpose-built for the Vector Database use cases. Both can be installed with pip." 
] }, { @@ -153,9 +153,17 @@ "import os\n", "import getpass\n", - "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")\n", - "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import OpenAIEmbeddings\n", + "\n", "embeddings = OpenAIEmbeddings()" ] }, @@ -215,6 +223,12 @@ "source": [ "## Initializing Redis\n", "\n", + "To deploy Redis locally, run:\n", + "```console\n", + "docker run -d -p 6379:6379 -p 8001:8001 redis/redis-stack:latest\n", + "```\n", + "If things are running correctly, you should see a nice Redis UI at http://localhost:8001. See the [Deployment Options](#deployment-options) section above for other ways to deploy.\n", + "\n", "The Redis VectorStore instance can be initialized in a number of ways. There are multiple class methods that can be used to initialize a Redis VectorStore instance.\n", "\n", "- ``Redis.__init__`` - Initialize directly\n", @@ -223,7 +237,7 @@ "- ``Redis.from_texts_return_keys`` - Initialize from a list of texts (optionally with metadata) and return the keys\n", "- ``Redis.from_existing_index`` - Initialize from an existing Redis index\n", "\n", - "Below we will use the ``Redis.from_documents`` method." + "Below we will use the ``Redis.from_texts`` method." ] }, { @@ -234,28 +248,12 @@ }, "outputs": [], "source": [ - "from langchain.vectorstores.redis import Redis" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you're not interested in the keys of your entries you can also create your redis instance from the documents." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.docstore.document import Document\n", + "from langchain.vectorstores.redis import Redis\n", "\n", - "documents = [Document(page_content=t, metadata=m) for t, m in zip(texts, metadata)]\n", - "rds = Redis.from_documents(\n", - " documents,\n", + "rds = Redis.from_texts(\n", + " texts,\n", " embeddings,\n", + " metadatas=metadata,\n", " redis_url=\"redis://localhost:6379\",\n", " index_name=\"users\"\n", ")" @@ -413,7 +411,8 @@ "- ``similarity_search``: Find the most similar vectors to a given vector.\n", "- ``similarity_search_with_score``: Find the most similar vectors to a given vector and return the vector distance\n", "- ``similarity_search_limit_score``: Find the most similar vectors to a given vector and limit the number of results to the ``score_threshold``\n", - "- ``similarity_search_with_relevance_scores``: Find the most similar vectors to a given vector and return the vector similarities" + "- ``similarity_search_with_relevance_scores``: Find the most similar vectors to a given vector and return the vector similarities\n", + "- ``max_marginal_relevance_search``: Find the most similar vectors to a given vector while also optimizing for diversity" ] }, { @@ -453,7 +452,7 @@ "results = rds.similarity_search(\"foo\", k=3)\n", "meta = results[1].metadata\n", "print(\"Key of the document in Redis: \", meta.pop(\"id\"))\n", - "print(\"Metadata of the document: \", meta)\n" + "print(\"Metadata of the document: \", meta)" ] }, { @@ -596,6 +595,26 @@ "print(results[0].metadata)" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# use maximal marginal relevance search to diversify results\n", + "results = 
rds.max_marginal_relevance_search(\"foo\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# the lambda_mult parameter controls the diversity of the results, the lower the more diverse\n", + "results = rds.max_marginal_relevance_search(\"foo\", lambda_mult=0.1)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -1208,7 +1227,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.11.3" } }, "nbformat": 4, diff --git a/docs/extras/integrations/vectorstores/sqlitevss.ipynb b/docs/extras/integrations/vectorstores/sqlitevss.ipynb new file mode 100644 index 0000000000..e670d5683f --- /dev/null +++ b/docs/extras/integrations/vectorstores/sqlitevss.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "source": [ + "# sqlite-vss\n", + "\n", + ">[sqlite-vss](https://alexgarcia.xyz/sqlite-vss/) is an SQLite extension designed for vector search, emphasizing local-first operations and easy integration into applications without external servers. Leveraging the Faiss library, it offers efficient similarity search and clustering capabilities.\n", + "\n", + "This notebook shows how to use the `SQLiteVSS` vector database." + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "# You need to install sqlite-vss as a dependency.\n", + "%pip install sqlite-vss" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "markdown", + "source": [ + "### Quickstart" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 2, + "outputs": [ + { + "data": { + "text/plain": "'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, I’d like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. 
One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence.'" + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import SQLiteVSS\n", + "from langchain.document_loaders import TextLoader\n", + "\n", + "# load the document and split it into chunks\n", + "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "\n", + "# split it into chunks\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "texts = [doc.page_content for doc in docs]\n", + "\n", + "\n", + "# create the open-source embedding function\n", + "embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "\n", + "\n", + "# load it in sqlite-vss in a table named state_union.\n", + "# the db_file parameter is the name of the file you want\n", + "# as your sqlite database.\n", + "db = SQLiteVSS.from_texts(\n", + " texts=texts,\n", + " embedding=embedding_function,\n", + " table=\"state_union\",\n", + " db_file=\"/tmp/vss.db\"\n", + ")\n", + "\n", + "# query it\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "data = db.similarity_search(query)\n", + "\n", + "# print results\n", + "data[0].page_content" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-06T14:55:55.370351Z", + "start_time": "2023-09-06T14:55:53.547755Z" + } + } + }, + { + "cell_type": "markdown", + "source": [ + "### Using existing sqlite connection" + ], + "metadata": { + "collapsed": false + } + }, + { + "cell_type": "code", + "execution_count": 7, + "outputs": [ + { + "data": { + "text/plain": "'Ketanji Brown Jackson is awesome'" + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import SQLiteVSS\n", + "from langchain.document_loaders import TextLoader\n", + "\n", + "# load the document and split it into chunks\n", + "loader = TextLoader(\"../../../state_of_the_union.txt\")\n", + "documents = loader.load()\n", + "\n", + "# split it into chunks\n", + "text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n", + "docs = text_splitter.split_documents(documents)\n", + "texts = [doc.page_content for doc in docs]\n", + "\n", + "\n", + "# create the open-source embedding function\n", + "embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "connection = SQLiteVSS.create_connection(db_file=\"/tmp/vss.db\")\n", + "\n", + "db1 = SQLiteVSS(\n", + " table=\"state_union\",\n", + " embedding=embedding_function,\n", + " connection=connection\n", + ")\n", + "\n", + "db1.add_texts([\"Ketanji Brown Jackson is awesome\"])\n", + "# query it again\n", + "query = \"What did the president say about Ketanji Brown Jackson\"\n", + "data = db1.similarity_search(query)\n", + "\n", + "# print results\n", + "data[0].page_content" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-06T14:59:22.086252Z", + "start_time": "2023-09-06T14:59:21.693237Z" + } + } + }, + { + "cell_type": 
"code", + "execution_count": 13, + "outputs": [], + "source": [ + "# Cleaning up\n", + "import os\n", + "os.remove(\"/tmp/vss.db\")" + ], + "metadata": { + "collapsed": false, + "ExecuteTime": { + "end_time": "2023-09-06T15:01:15.550318Z", + "start_time": "2023-09-06T15:01:15.546428Z" + } + } + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [], + "metadata": { + "collapsed": false + } + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/docs/extras/integrations/vectorstores/supabase.ipynb b/docs/extras/integrations/vectorstores/supabase.ipynb index 9ba5dacd08..9a5f583adb 100644 --- a/docs/extras/integrations/vectorstores/supabase.ipynb +++ b/docs/extras/integrations/vectorstores/supabase.ipynb @@ -28,43 +28,41 @@ "The following function determines cosine similarity, but you can adjust to your needs.\n", "\n", "```sql\n", - " -- Enable the pgvector extension to work with embedding vectors\n", - " create extension vector;\n", + "-- Enable the pgvector extension to work with embedding vectors\n", + "create extension if not exists vector;\n", "\n", - " -- Create a table to store your documents\n", - " create table documents (\n", - " id uuid primary key,\n", - " content text, -- corresponds to Document.pageContent\n", - " metadata jsonb, -- corresponds to Document.metadata\n", - " embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed\n", - " );\n", + "-- Create a table to store your documents\n", + "create table\n", + " documents (\n", + " id uuid primary key,\n", + " content text, -- corresponds to Document.pageContent\n", + " metadata jsonb, -- corresponds to Document.metadata\n", + " embedding vector (1536) -- 1536 works for OpenAI embeddings, change if needed\n", + " );\n", "\n", - " CREATE FUNCTION match_documents(query_embedding vector(1536), match_count int)\n", - " RETURNS TABLE(\n", - " id uuid,\n", - " content text,\n", - " metadata jsonb,\n", - " -- we return matched vectors to enable maximal marginal relevance searches\n", - " embedding vector(1536),\n", - " similarity float)\n", - " LANGUAGE plpgsql\n", - " AS $$\n", - " # variable_conflict use_column\n", - " BEGIN\n", - " RETURN query\n", - " SELECT\n", - " id,\n", - " content,\n", - " metadata,\n", - " embedding,\n", - " 1 -(documents.embedding <=> query_embedding) AS similarity\n", - " FROM\n", - " documents\n", - " ORDER BY\n", - " documents.embedding <=> query_embedding\n", - " LIMIT match_count;\n", - " END;\n", - " $$;\n", + "-- Create a function to search for documents\n", + "create function match_documents (\n", + " query_embedding vector (1536),\n", + " filter jsonb default '{}'\n", + ") returns table (\n", + " id uuid,\n", + " content text,\n", + " metadata jsonb,\n", + " similarity float\n", + ") language plpgsql as $$\n", + "#variable_conflict use_column\n", + "begin\n", + " return query\n", + " select\n", + " id,\n", + " content,\n", + " metadata,\n", + " 1 - (documents.embedding <=> query_embedding) as similarity\n", + " from documents\n", + " where metadata @> filter\n", + " order by documents.embedding <=> query_embedding;\n", + "end;\n", + "$$;\n", "```" ] }, diff --git 
a/docs/extras/integrations/vectorstores/vearch.ipynb b/docs/extras/integrations/vectorstores/vearch.ipynb new file mode 100644 index 0000000000..8e14c12369 --- /dev/null +++ b/docs/extras/integrations/vectorstores/vearch.ipynb @@ -0,0 +1,413 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/export/anaconda3/envs/langchainGLM6B/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "INFO 2023-08-28 18:26:07,485-1d: \n", + "loading model config\n", + "llm device: cuda\n", + "embedding device: cuda\n", + "dir: /data/zhx/zhx/langchain-ChatGLM_new\n", + "flagging username: e2fc35b8e87c4de18d692e951a5f7c46\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "True\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading checkpoint shards: 100%|██████████| 7/7 [00:06<00:00, 1.01it/s]\n" + ] + } + ], + "source": [ + "\n", + "import os, sys, torch\n", + "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel\n", + "from langchain import HuggingFacePipeline, ConversationChain\n", + "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", + "from langchain.vectorstores.vearch import VearchDb\n", + "from langchain.document_loaders import TextLoader\n", + "from langchain.prompts import PromptTemplate\n", + "from langchain.chains import RetrievalQA\n", + "from langchain.embeddings.huggingface import HuggingFaceEmbeddings\n", + "\n", + "# your local model path\n", + "model_path =\"/data/zhx/zhx/langchain-ChatGLM_new/chatglm2-6b\" \n", + "\n", + "tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)\n", + "model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Human: 你好!\n", + "ChatGLM:你好👋!我是人工智能助手 ChatGLM2-6B,很高兴见到你,欢迎问我任何问题。\n", + "\n", + "Human: 你知道凌波微步吗,你知道都有谁学会了吗?\n", + "ChatGLM:凌波微步是一种步伐,最早出自于《倚天屠龙记》。在小说中,灭绝师太曾因与练习凌波微步的杨过的恩怨纠葛,而留下了一部经书,内容是记载凌波微步的起源和作用。后来,凌波微步便成为杨过和小龙女的感情象征。在现实生活中,凌波微步是一句口号,是清华大学学生社团“模型社”的社训。\n", + "\n" + ] + } + ], + "source": [ + "query = \"你好!\"\n", + "response, history = model.chat(tokenizer, query, history=[])\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", + "query = \"你知道凌波微步吗,你知道都有谁学会了吗?\"\n", + "response, history = model.chat(tokenizer, query, history=history)\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO 2023-08-28 18:27:36,037-1d: Load pretrained SentenceTransformer: /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese\n", + "WARNING 2023-08-28 18:27:36,038-1d: No sentence-transformers model found with name /data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese. 
Creating a new one with MEAN pooling.\n", + "INFO 2023-08-28 18:27:38,936-1d: Use pytorch device: cuda\n" + ] + } + ], + "source": [ + "# Add your local knowledge files\n", + "file_path = \"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt\"#Your local file path\"\n", + "loader = TextLoader(file_path,encoding=\"utf-8\")\n", + "documents = loader.load()\n", + "\n", + "# split text into sentences and embedding the sentences\n", + "text_splitter = RecursiveCharacterTextSplitter(\n", + " chunk_size=500, chunk_overlap=100)\n", + "texts = text_splitter.split_documents(documents)\n", + "\n", + "#your model path\n", + "embedding_path = '/data/zhx/zhx/langchain-ChatGLM_new/text2vec/text2vec-large-chinese'\n", + "embeddings = HuggingFaceEmbeddings(model_name=embedding_path)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 4.56it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed', '9a640124fc324a8abb0eaa31acb638b7']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "#first add your document into vearch vectorstore\n", + "vearch_db = VearchDb.from_documents(texts,embeddings,table_name=\"your_table_name\",metadata_path=\"/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/your_table_name\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 22.49it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "####################第1段相关文档####################\n", + "\n", + "午饭过后,段誉又练“凌波微步”,走一步,吸一口气,走第二步时将气呼出,六十四卦走完,四肢全无麻痹之感,料想呼吸顺畅,便无害处。第二次再走时连走两步吸一口气,再走两步始行呼出。这“凌波微步”是以动功修习内功,脚步踏遍六十四卦一个周天,内息自然而然地也转了一个周天。因此他每走一遍,内力便有一分进益。\n", + "\n", + "这般练了几天,“凌波微步”已走得颇为纯熟,不须再数呼吸,纵然疾行,气息也已无所窒滞。心意既畅,跨步时渐渐想到《洛神赋》中那些与“凌波微步”有关的句子:“仿佛兮若轻云之蔽月,飘飘兮若流风之回雪”,“竦轻躯以鹤立,若将飞而未翔”,“体迅飞凫,飘忽若神”,“动无常则,若危若安。进止难期,若往若还”。\n", + "\n", + "\n", + "\n", + "百度简介\n", + "\n", + "凌波微步是「逍遥派」独门轻功身法,精妙异常。\n", + "\n", + "凌波微步乃是一门极上乘的轻功,所以列于卷轴之末,以易经八八六十四卦为基础,使用者按特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。此步法精妙异常,原是要待人练成「北冥神功」,吸人内力,自身内力已【颇为深厚】之后再练。\n", + "\n", + "####################第2段相关文档####################\n", + "\n", + "《天龙八部》第五回 微步縠纹生\n", + "\n", + "卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\n", + "\n", + "卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”\n", + "\n", + "####################第3段相关文档####################\n", + "\n", + "《天龙八部》第二回 玉壁月华明\n", + "\n", + "再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\n", + "\n", + "帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\n", + "\n", + "段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\n", + "卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。\n", + "\n", 
+ "********ChatGLM:凌波微步是一种轻功身法,属于逍遥派独门轻功。它以《易经》中的六十四卦为基础,按照特定顺序踏着卦象方位行进,从第一步到最后一步正好行走一个大圈。凌波微步精妙异常,可以让人内力相助,自身内力颇为深厚之后再练。《天龙八部》第五回中有描述。\n", + "\n" + ] + } + ], + "source": [ + "\n", + "res=vearch_db.similarity_search(query, 3)\n", + "query = \"你知道凌波微步吗,你知道都有谁会凌波微步?\"\n", + "for idx,tmp in enumerate(res): \n", + " print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", + "\n", + "# combine your local knowleadge and query \n", + "context = \"\".join([tmp.page_content for tmp in res])\n", + "new_query = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context} \\n 回答用户这个问题:{query}\\n\\n\"\n", + "response, history = model.chat(tokenizer, new_query, history=[])\n", + "print(f\"********ChatGLM:{response}\\n\")\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Human: 你知道vearch是什么吗?\n", + "ChatGLM:是的,我知道 Vearch。Vearch 是一种矩阵分解 technique,用于将矩阵分解为若干个不可约矩阵的乘积。它是由 Linus Torvalds 开发的,旨在提高 Linux 内核中矩阵操作的性能。\n", + "\n", + "Vearch 可以通过使用特殊的操作来对矩阵进行操作,从而避免了使用昂贵的矩阵操作库。它也被广泛用于其他操作系统中,如 FreeBSD 和 Solaris。\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 31.59it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b']\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "data": { + "text/plain": [ + "['04bc84fff5074b7b8990441e92e6df07',\n", + " 'e221906153bb4e03bc7095dadea144de',\n", + " '126034ba51934093920d8732860f340b']" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"你知道vearch是什么吗?\"\n", + "response, history = model.chat(tokenizer, query, history=history)\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", + "\n", + "\n", + "vearch_info = [\"Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\",\n", + " \"Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\",\n", + " \"vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\"]\n", + "vearch_source=[{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'},{'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/tlbb/three_body.txt'}]\n", + "vearch_db.add_texts(vearch_info,vearch_source)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Batches: 100%|██████████| 1/1 [00:00<00:00, 25.57it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "####################第1段相关文档####################\n", + "\n", + "Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用\n", + "\n", + "####################第2段相关文档####################\n", + "\n", + "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库\n", + "\n", + "####################第3段相关文档####################\n", + "\n", + "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装\n", + "\n", + "***************ChatGLM:是的,Varch是一个向量数据库,旨在存储和快速搜索模型embedding后的向量。它支持OpenAI、Llama和ChatGLM等模型,并可以直接通过pip安装。Varch是一个基于C语言和Go语言开发的项目,并提供了Python接口。\n", + "\n" + ] + } + ], + "source": [ + "query3 = \"你知道vearch是什么吗?\"\n", + "res1 = vearch_db.similarity_search(query3, 3)\n", + "for idx,tmp in enumerate(res1): \n", + " 
print(f\"{'#'*20}第{idx+1}段相关文档{'#'*20}\\n\\n{tmp.page_content}\\n\")\n", + "\n", + "context1 = \"\".join([tmp.page_content for tmp in res1])\n", + "new_query1 = f\"基于以下信息,尽可能准确的来回答用户的问题。背景信息:\\n {context1} \\n 回答用户这个问题:{query3}\\n\\n\"\n", + "response, history = model.chat(tokenizer, new_query1, history=[])\n", + "\n", + "print(f\"***************ChatGLM:{response}\\n\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "delete docid True\n", + "Human: 你知道vearch是什么吗?\n", + "ChatGLM:Vearch是一种高分子化合物,也称为聚合物、高分子材料或合成材料。它是由重复单元组成的大型聚合物,通常由一些重复单元组成,这些单元在聚合过程中结合在一起形成一个连续的高分子链。\n", + "\n", + "Vearch具有许多独特的性质,例如高强度、高刚性、耐磨、耐腐蚀、耐高温等。它们通常用于制造各种应用,例如塑料制品、橡胶、纤维、建筑材料等。\n", + "\n", + "after delete docid to query again: {}\n", + "get existed docid {'7aae36236f784105a0004d8ff3c7c3ad': Document(page_content='《天龙八部》第二回 玉壁月华明\\n\\n再展帛卷,长卷上源源皆是裸女画像,或立或卧,或现前胸,或见后背。人像的面容都是一般,但或喜或愁,或含情凝眸,或轻嗔薄怒,神情各异。一共有三十六幅图像,每幅像上均有颜色细线,注明穴道部位及练功法诀。\\n\\n帛卷尽处题着“凌波微步”四字,其后绘的是无数足印,注明“妇妹”、“无妄”等等字样,尽是《易经》中的方位。段誉前几日还正全心全意地钻研《易经》,一见到这些名称,登时精神大振,便似遇到故交良友一般。只见足印密密麻麻,不知有几千百个,自一个足印至另一个足印均有绿线贯串,线上绘有箭头,最后写着一行字道:“步法神妙,保身避敌,待积内力,再取敌命。”\\n\\n段誉心道:“神仙姊姊所遗的步法,必定精妙之极,遇到强敌时脱身逃走,那就很好,‘再取敌命’也就不必了。”\\n卷好帛卷,对之作了两个揖,珍而重之地揣入怀中,转身对那玉像道:“神仙姊姊,你吩咐我朝午晚三次练功,段誉不敢有违。今后我对人加倍客气,别人不会来打我,我自然也不会去吸他内力。你这套‘凌波微步’我更要用心练熟,眼见不对,立刻溜之大吉,就吸不到他内力了。”至于“杀尽我逍遥派弟子”一节,却想也不敢去想。', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'}), '7e495d4e5962497db2080e84d52e75ed': Document(page_content='《天龙八部》第五回 微步縠纹生\\n\\n卷轴中此外诸种经脉修习之法甚多,皆是取人内力的法门,段誉虽自语宽解,总觉习之有违本性,单是贪多务得,便非好事,当下暂不理会。\\n\\n卷到卷轴末端,又见到了“凌波微步”那四字,登时便想起《洛神赋》中那些句子来:“凌波微步,罗袜生尘……转眄流精,光润玉颜。含辞未吐,气若幽兰。华容婀娜,令我忘餐。”曹子建那些千古名句,在脑海中缓缓流过:“秾纤得衷,修短合度,肩若削成,腰如约素。延颈秀项,皓质呈露。芳泽无加,铅华弗御。云髻峨峨,修眉连娟。丹唇外朗,皓齿内鲜。明眸善睐,靥辅承权。瑰姿艳逸,仪静体闲。柔情绰态,媚于语言……”这些句子用在木婉清身上,“这话倒也有理”;但如用之于神仙姊姊,只怕更为适合。想到神仙姊姊的姿容体态,“皎若太阳升朝霞,灼若芙蓉出绿波”,但觉依她吩咐行事,实为人生至乐,心想:“我先来练这‘凌波微步’,此乃逃命之妙法,非害人之手段也,练之有百利而无一害。”', metadata={'source': '/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/天龙八部/lingboweibu.txt'})}\n" + ] + } + ], + "source": [ + "##delete and get function need to maintian docids \n", + "##your docid\n", + "res_d=vearch_db.delete(['04bc84fff5074b7b8990441e92e6df07', 'e221906153bb4e03bc7095dadea144de', '126034ba51934093920d8732860f340b'])\n", + "print(\"delete docid\",res_d)\n", + "query = \"你知道vearch是什么吗?\"\n", + "response, history = model.chat(tokenizer, query, history=[])\n", + "print(f\"Human: {query}\\nChatGLM:{response}\\n\")\n", + "get_id_doc=vearch_db.get(['04bc84fff5074b7b8990441e92e6df07'])\n", + "print(\"after delete docid to query again:\",get_id_doc)\n", + "get_delet_doc=vearch_db.get(['7aae36236f784105a0004d8ff3c7c3ad', '7e495d4e5962497db2080e84d52e75ed'])\n", + "print(\"get existed docid\",get_delet_doc)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3.10.12 ('langchainGLM6B')", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.10.12" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "1fd24e7ef183310e43cbf656d21568350c6a30580b6df7fe3b34654b3770f74d" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/docs/extras/integrations/vectorstores/vectara.ipynb b/docs/extras/integrations/vectorstores/vectara.ipynb index 0741c1b199..e95504860b 100644 --- a/docs/extras/integrations/vectorstores/vectara.ipynb +++ b/docs/extras/integrations/vectorstores/vectara.ipynb @@ -26,7 +26,7 @@ "source": [ "# Setup\n", "\n", - "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps:\n", + "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps (see our [quickstart](https://docs.vectara.com/docs/quickstart) guide):\n", "1. [Sign up](https://console.vectara.com/signup) for a Vectara account if you don't already have one. Once you have completed your sign up you will have a Vectara customer ID. You can find your customer ID by clicking on your name, on the top-right of the Vectara console window.\n", "2. Within your account you can create one or more corpora. Each corpus represents an area that stores text data upon ingest from input documents. To create a corpus, use the **\"Create Corpus\"** button. You then provide a name to your corpus as well as a description. Optionally you can define filtering attributes and apply some advanced options. If you click on your created corpus, you can see its name and corpus ID right on the top.\n", "3. Next you'll need to create API keys to access the corpus. Click on the **\"Authorization\"** tab in the corpus view and then the **\"Create API Key\"** button. Give your key a name, and choose whether you want query only or query+index for your key. Click \"Create\" and you now have an active API key. Keep this key confidential. \n", @@ -47,7 +47,7 @@ "os.environ[\"VECTARA_API_KEY\"] = getpass.getpass(\"Vectara API Key:\")\n", "```\n", "\n", - "2. Add them to the Vectara vectorstore constructor:\n", + "1. Provide them as arguments when creating the Vectara vectorstore object:\n", "\n", "```python\n", "vectorstore = Vectara(\n", @@ -65,13 +65,22 @@ "source": [ "## Connecting to Vectara from LangChain\n", "\n", - "To get started, let's ingest the documents using the from_documents() method.\n", - "We assume here that you've added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and query+indexing VECTARA_API_KEY as environment variables." + "In this example, we assume that you've created an account and a corpus, and added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY (created with permissions for both indexing and query) as environment variables.\n", + "\n", + "The corpus has 3 fields defined as metadata for filtering:\n", + "* url: a string field containing the source URL of the document (where relevant)\n", + "* speech: a string field containing the name of the speech\n", + "* author: the name of the author\n", + "\n", + "Let's start by ingesting 3 documents into the corpus:\n", + "1. The State of the Union speech from 2022, available in the LangChain repository as a text file\n", + "2. The \"I have a dream\" speech by Dr. Kind\n", + "3. 
The \"We shall Fight on the Beaches\" speech by Winston Churchil" ] }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "04a1f1a0", "metadata": {}, "outputs": [], @@ -79,12 +88,17 @@ "from langchain.embeddings import FakeEmbeddings\n", "from langchain.text_splitter import CharacterTextSplitter\n", "from langchain.vectorstores import Vectara\n", - "from langchain.document_loaders import TextLoader" + "from langchain.document_loaders import TextLoader\n", + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "be0a4973", "metadata": {}, "outputs": [], @@ -97,7 +111,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "id": "8429667e", "metadata": { "ExecuteTime": { @@ -111,7 +125,7 @@ "vectara = Vectara.from_documents(\n", " docs,\n", " embedding=FakeEmbeddings(size=768),\n", - " doc_metadata={\"speech\": \"state-of-the-union\"},\n", + " doc_metadata={\"speech\": \"state-of-the-union\", \"author\": \"Biden\"},\n", ")" ] }, @@ -130,7 +144,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "id": "85ef3468", "metadata": {}, "outputs": [], @@ -142,14 +156,16 @@ " [\n", " \"https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf\",\n", " \"I-have-a-dream\",\n", + " \"Dr. King\"\n", " ],\n", " [\n", " \"https://www.parkwayschools.net/cms/lib/MO01931486/Centricity/Domain/1578/Churchill_Beaches_Speech.pdf\",\n", " \"we shall fight on the beaches\",\n", + " \"Churchil\"\n", " ],\n", "]\n", "files_list = []\n", - "for url, _ in urls:\n", + "for url, _, _ in urls:\n", " name = tempfile.NamedTemporaryFile().name\n", " urllib.request.urlretrieve(url, name)\n", " files_list.append(name)\n", @@ -157,7 +173,7 @@ "docsearch: Vectara = Vectara.from_files(\n", " files=files_list,\n", " embedding=FakeEmbeddings(size=768),\n", - " metadatas=[{\"url\": url, \"speech\": title} for url, title in urls],\n", + " metadatas=[{\"url\": url, \"speech\": title, \"author\": author} for url, title, author in urls],\n", ")" ] }, @@ -178,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 6, "id": "a8c513ab", "metadata": { "ExecuteTime": { @@ -197,7 +213,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "id": "fc516993", "metadata": { "ExecuteTime": { @@ -231,7 +247,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "id": "8804a21d", "metadata": { "ExecuteTime": { @@ -249,7 +265,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 9, "id": "756a6887", "metadata": { "ExecuteTime": { @@ -264,7 +280,7 @@ "text": [ "Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 
A former top litigator in private practice.\n", "\n", - "Score: 0.786569\n" + "Score: 0.8299499\n" ] } ], @@ -284,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 10, "id": "47784de5", "metadata": {}, "outputs": [ @@ -307,7 +323,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 11, "id": "3e22949f", "metadata": {}, "outputs": [ @@ -315,7 +331,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "With this threshold of 0.2 we have 3 documents\n" + "With this threshold of 0.2 we have 5 documents\n" ] } ], @@ -340,7 +356,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 12, "id": "9427195f", "metadata": { "ExecuteTime": { @@ -352,10 +368,10 @@ { "data": { "text/plain": [ - "VectaraRetriever(tags=['Vectara'], metadata=None, vectorstore=, search_type='similarity', search_kwargs={'lambda_val': 0.025, 'k': 5, 'filter': '', 'n_sentence_context': '2'})" + "VectaraRetriever(tags=['Vectara'], metadata=None, vectorstore=, search_type='similarity', search_kwargs={'lambda_val': 0.025, 'k': 5, 'filter': '', 'n_sentence_context': '2'})" ] }, - "execution_count": 11, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -367,7 +383,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 13, "id": "f3c70c31", "metadata": { "ExecuteTime": { @@ -379,10 +395,10 @@ { "data": { "text/plain": [ - "Document(page_content='Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. A former top litigator in private practice.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '596', 'len': '97', 'speech': 'state-of-the-union'})" + "Document(page_content='Justice Breyer, thank you for your service. One of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. And I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nation’s top legal minds, who will continue Justice Breyer’s legacy of excellence. 
A former top litigator in private practice.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '596', 'len': '97', 'speech': 'state-of-the-union', 'author': 'Biden'})" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -392,10 +408,118 @@ "retriever.get_relevant_documents(query)[0]" ] }, + { + "cell_type": "markdown", + "id": "e944c26a", + "metadata": {}, + "source": [ + "## Using Vectara as a SelfQuery Retriever" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "8be674de", + "metadata": {}, + "outputs": [], + "source": [ + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"speech\",\n", + " description=\"what name of the speech\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"author\",\n", + " description=\"author of the speech\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + "]\n", + "document_content_description = \"the text of the speech\"\n", + "\n", + "vectordb = Vectara()\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(llm, vectara, \n", + " document_content_description, metadata_field_info, \n", + " verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "f8938999", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ofer/dev/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='freedom' filter=Comparison(comparator=, attribute='author', value='Biden') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Well I know this nation. We will meet the test. To protect freedom and liberty, to expand fairness and opportunity. We will save democracy. As hard as these times have been, I am more optimistic about America today than I have been my whole life.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '346', 'len': '67', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='To our fellow Ukrainian Americans who forge a deep bond that connects our two nations we stand with you. Putin may circle Kyiv with tanks, but he will never gain the hearts and souls of the Ukrainian people. He will never extinguish their love of freedom. He will never weaken the resolve of the free world. We meet tonight in an America that has lived through two of the hardest years this nation has ever faced.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '740', 'len': '47', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='But most importantly as Americans. With a duty to one another to the American people to the Constitution. And with an unwavering resolve that freedom will always triumph over tyranny. Six days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '413', 'len': '77', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='We can do this. \\n\\nMy fellow Americans—tonight , we have gathered in a sacred space—the citadel of our democracy. 
In this Capitol, generation after generation, Americans have debated great questions amid great strife, and have done great things. We have fought for freedom, expanded liberty, defeated totalitarianism and terror. And built the strongest, freest, and most prosperous nation the world has ever known. Now is the hour. \\n\\nOur moment of responsibility.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '906', 'len': '82', 'speech': 'state-of-the-union', 'author': 'Biden'}),\n", + " Document(page_content='In state after state, new laws have been passed, not only to suppress the vote, but to subvert entire elections. We cannot let this happen. Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while you’re at it, pass the Disclose Act so Americans can know who is funding our elections.', metadata={'source': 'langchain', 'lang': 'eng', 'offset': '0', 'len': '63', 'speech': 'state-of-the-union', 'author': 'Biden'})]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever.get_relevant_documents(\"what did Biden say about the freedom?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "a97037fb", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='freedom' filter=Comparison(comparator=, attribute='author', value='Dr. King') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='And if America is to be a great nation, this must become true. So\\nlet freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let\\nfreedom ring from the snowcapped Rockies of Colorado.', metadata={'lang': 'eng', 'section': '3', 'offset': '1534', 'len': '55', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='And if America is to be a great nation, this must become true. So\\nlet freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let\\nfreedom ring from the snowcapped Rockies of Colorado.', metadata={'lang': 'eng', 'section': '3', 'offset': '1534', 'len': '55', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. Let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout\\nMountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi, from every\\nmountain side. Let freedom ring . . 
.\\nWhen we allow freedom to ring—when we let it ring from every city and every hamlet, from every state\\nand every city, we will be able to speed up that day when all of God’s children, black men and white\\nmen, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the\\nold Negro spiritual, “Free at last, Free at last, Great God a-mighty, We are free at last.”', metadata={'lang': 'eng', 'section': '3', 'offset': '1842', 'len': '52', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. Let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout\\nMountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi, from every\\nmountain side. Let freedom ring . . .\\nWhen we allow freedom to ring—when we let it ring from every city and every hamlet, from every state\\nand every city, we will be able to speed up that day when all of God’s children, black men and white\\nmen, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the\\nold Negro spiritual, “Free at last, Free at last, Great God a-mighty, We are free at last.”', metadata={'lang': 'eng', 'section': '3', 'offset': '1842', 'len': '52', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'}),\n", + " Document(page_content='Let freedom ring from the mighty\\nmountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let\\nfreedom ring from the snowcapped Rockies of Colorado. Let freedom ring from the curvaceous slopes of\\nCalifornia. But not only that. Let freedom ring from Stone Mountain of Georgia.', metadata={'lang': 'eng', 'section': '3', 'offset': '1657', 'len': '57', 'CreationDate': '1424880481', 'Producer': 'Adobe PDF Library 10.0', 'Author': 'Sasha Rolon-Pereira', 'Title': 'Martin Luther King Jr.pdf', 'Creator': 'Acrobat PDFMaker 10.1 for Word', 'ModDate': '1424880524', 'url': 'https://www.gilderlehrman.org/sites/default/files/inline-pdfs/king.dreamspeech.excerpts.pdf', 'speech': 'I-have-a-dream', 'author': 'Dr. King', 'title': 'Martin Luther King Jr.pdf'})]" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "retriever.get_relevant_documents(\"what did Dr. 
King say about the freedom?\")" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "2300e785", + "id": "f6d17e90", "metadata": {}, "outputs": [], "source": [] diff --git a/docs/extras/modules/agents/agent_types/xml_agent.ipynb b/docs/extras/modules/agents/agent_types/xml_agent.ipynb index ed183d0467..251c94c171 100644 --- a/docs/extras/modules/agents/agent_types/xml_agent.ipynb +++ b/docs/extras/modules/agents/agent_types/xml_agent.ipynb @@ -141,7 +141,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.1" } }, "nbformat": 4, diff --git a/docs/extras/modules/data_connection/retrievers/self_query/redis_self_query.ipynb b/docs/extras/modules/data_connection/retrievers/self_query/redis_self_query.ipynb new file mode 100644 index 0000000000..d74ea2dd68 --- /dev/null +++ b/docs/extras/modules/data_connection/retrievers/self_query/redis_self_query.ipynb @@ -0,0 +1,472 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "13afcae7", + "metadata": {}, + "source": [ + "# Redis self-querying \n", + "\n", + ">[Redis](https://redis.com) is an open-source key-value store that can be used as a cache, message broker, database, vector database and more.\n", + "\n", + "In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Redis vector store. " + ] + }, + { + "cell_type": "markdown", + "id": "68e75fb9", + "metadata": {}, + "source": [ + "## Creating a Redis vector store\n", + "First we'll want to create a Redis vector store and seed it with some data. We've created a small demo set of documents that contain summaries of movies.\n", + "\n", + "**Note:** The self-query retriever requires you to have `lark` installed (`pip install lark`) along with integration-specific requirements." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "63a8af5b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# !pip install redis redisvl openai tiktoken lark" + ] + }, + { + "cell_type": "markdown", + "id": "83811610-7df3-4ede-b268-68a6a83ba9e2", + "metadata": {}, + "source": [ + "We want to use `OpenAIEmbeddings` so we have to get the OpenAI API Key." 
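+ "\n", + "Since the vector store below will connect to `redis://localhost:6379`, it can also help to confirm up front that a Redis server is actually reachable at that URL. A minimal sketch using the `redis` client (this assumes the default local instance used throughout this notebook):\n", + "\n", + "```python\n", + "import redis\n", + "\n", + "# ping() raises an error if no Redis server is listening at the given URL.\n", + "client = redis.Redis.from_url(\"redis://localhost:6379\")\n", + "assert client.ping()\n", + "```"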
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dd01b61b-7d32-4a55-85d6-b2d2d4f18840", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "cb4a5787", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.schema import Document\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import Redis\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "bcbe04d9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = [\n", + " Document(\n", + " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", + " metadata={\"year\": 1993, \"rating\": 7.7, \"director\": \"Steven Spielberg\", \"genre\": \"science fiction\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", + " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"genre\": \"science fiction\", \"rating\": 8.2},\n", + " ),\n", + " Document(\n", + " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", + " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"genre\": \"science fiction\", \"rating\": 8.6},\n", + " ),\n", + " Document(\n", + " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", + " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"genre\": \"drama\", \"rating\": 8.3},\n", + " ),\n", + " Document(\n", + " page_content=\"Toys come alive and have a blast doing so\",\n", + " metadata={\"year\": 1995, \"director\": \"John Lasseter\", \"genre\": \"animated\", \"rating\": 9.1,},\n", + " ),\n", + " Document(\n", + " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", + " metadata={\n", + " \"year\": 1979,\n", + " \"rating\": 9.9,\n", + " \"director\": \"Andrei Tarkovsky\",\n", + " \"genre\": \"science fiction\",\n", + " },\n", + " ),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "393aff3b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`index_schema` does not match generated metadata schema.\n", + "If you meant to manually override the schema, please ignore this message.\n", + "index_schema: {'tag': [{'name': 'genre'}], 'text': [{'name': 'director'}], 'numeric': [{'name': 'year'}, {'name': 'rating'}]}\n", + "generated_schema: {'text': [{'name': 'director'}, {'name': 'genre'}], 'numeric': [{'name': 'year'}, {'name': 'rating'}], 'tag': []}\n", + "\n" + ] + } + ], + "source": [ + "index_schema = {\n", + " \"tag\": [{\"name\": \"genre\"}],\n", + " \"text\": [{\"name\": \"director\"}],\n", + " \"numeric\": [{\"name\": \"year\"}, {\"name\": \"rating\"}],\n", + "}\n", + "\n", + "vectorstore = Redis.from_documents(\n", + " docs, \n", + " embeddings, \n", + " redis_url=\"redis://localhost:6379\",\n", + " index_name=\"movie_reviews\",\n", + " index_schema=index_schema,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "5ecaab6d", + "metadata": {}, + "source": [ + "## Creating our self-querying retriever\n", + "Now we can instantiate our retriever. 
To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "86e34dbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"director\",\n", + " description=\"The name of the movie director\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " ),\n", + "]\n", + "document_content_description = \"Brief summary of a movie\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ea1126cb", + "metadata": {}, + "outputs": [], + "source": [ + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, \n", + " vectorstore, \n", + " document_content_description, \n", + " metadata_field_info, \n", + " verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ea9df8d4", + "metadata": {}, + "source": [ + "## Testing it out\n", + "And now we can try actually using our retriever!" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "38a126e9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/bagatur/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'id': 'doc:movie_reviews:7b5481d753bc4135851b66fa61def7fb', 'director': 'Steven Spielberg', 'genre': 'science fiction', 'year': '1993', 'rating': '7.7'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'id': 'doc:movie_reviews:2cc66f38bfbd438eb3a045d90a1a4088', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'year': '1979', 'rating': '9.9'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'id': 'doc:movie_reviews:edf567b1d5334e02b2a4c692d853c80c', 'director': 'Satoshi Kon', 'genre': 'science fiction', 'year': '2006', 'rating': '8.6'})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 9, + "id": "fc3f1e6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=, attribute='rating', value=8.4) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'id': 'doc:movie_reviews:2cc66f38bfbd438eb3a045d90a1a4088', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'year': '1979', 'rating': '9.9'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'id': 'doc:movie_reviews:edf567b1d5334e02b2a4c692d853c80c', 'director': 'Satoshi Kon', 'genre': 'science fiction', 'year': '2006', 'rating': '8.6'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a filter\n", + "retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.4\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b19d4da0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='women' filter=Comparison(comparator=, attribute='director', value='Greta Gerwig') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'id': 'doc:movie_reviews:bb899807b93c442083fd45e75a4779d5', 'director': 'Greta Gerwig', 'genre': 'drama', 'year': '2019', 'rating': '8.3'})]" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a filter\n", + "retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f900e40e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='rating', value=8.5), Comparison(comparator=, attribute='genre', value='science fiction')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'id': 'doc:movie_reviews:2cc66f38bfbd438eb3a045d90a1a4088', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'year': '1979', 'rating': '9.9'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'id': 'doc:movie_reviews:edf567b1d5334e02b2a4c692d853c80c', 'director': 'Satoshi Kon', 'genre': 'science fiction', 'year': '2006', 'rating': '8.6'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a highly rated (above 8.5) science fiction film?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "12a51522", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "query='toys' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='year', value=1990), Comparison(comparator=, attribute='year', value=2005), Comparison(comparator=, attribute='genre', value='animated')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'})]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51", + "metadata": {}, + "source": [ + "## Filter k\n", + "\n", + "We can also use the self query retriever to specify `k`: the number of documents to fetch.\n", + "\n", + "We can do this by passing `enable_limit=True` to the constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "bff36b88-b506-4877-9c63-e5a1a8d78e64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "retriever = SelfQueryRetriever.from_llm(\n", + " llm,\n", + " vectorstore,\n", + " document_content_description,\n", + " metadata_field_info,\n", + " enable_limit=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2758d229-4f97-499c-819f-888acaf8ee10", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'id': 'doc:movie_reviews:7b5481d753bc4135851b66fa61def7fb', 'director': 'Steven Spielberg', 'genre': 'science fiction', 'year': '1993', 'rating': '7.7'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'id': 'doc:movie_reviews:9e4e84daa0374941a6aa4274e9bbb607', 'director': 'John Lasseter', 'genre': 'animated', 'year': '1995', 'rating': '9.1'})]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"what are two movies about dinosaurs\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "poetry-venv", + "language": "python", + "name": "poetry-venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/modules/data_connection/retrievers/self_query/supabase_self_query.ipynb b/docs/extras/modules/data_connection/retrievers/self_query/supabase_self_query.ipynb new file mode 100644 index 0000000000..1414f70d38 --- /dev/null +++ b/docs/extras/modules/data_connection/retrievers/self_query/supabase_self_query.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "13afcae7", + "metadata": {}, + "source": [ + "# Supabase Vector self-querying \n", + "\n", + 
">[Supabase](https://supabase.com/docs) is an open source `Firebase` alternative. \n", + "> `Supabase` is built on top of `PostgreSQL`, which offers strong `SQL` \n", + "> querying capabilities and enables a simple interface with already-existing tools and frameworks.\n", + "\n", + ">[PostgreSQL](https://en.wikipedia.org/wiki/PostgreSQL) also known as `Postgres`,\n", + "> is a free and open-source relational database management system (RDBMS) \n", + "> emphasizing extensibility and `SQL` compliance.\n", + "\n", + "In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Supabase vector store.\n", + "\n", + "Specifically we will:\n", + "1. Create a Supabase database\n", + "2. Enable the `pgvector` extension\n", + "3. Create a `documents` table and `match_documents` function that will be used by `SupabaseVectorStore`\n", + "4. Load sample documents into the vector store (database table)\n", + "5. Build and test a self-querying retriever" + ] + }, + { + "cell_type": "markdown", + "id": "347935ad", + "metadata": {}, + "source": [ + "## Setup Supabase Database\n", + "\n", + "1. Head over to https://database.new to provision your Supabase database.\n", + "2. In the studio, jump to the [SQL editor](https://supabase.com/dashboard/project/_/sql/new) and run the following script to enable `pgvector` and setup your database as a vector store:\n", + " ```sql\n", + " -- Enable the pgvector extension to work with embedding vectors\n", + " create extension if not exists vector;\n", + "\n", + " -- Create a table to store your documents\n", + " create table\n", + " documents (\n", + " id uuid primary key,\n", + " content text, -- corresponds to Document.pageContent\n", + " metadata jsonb, -- corresponds to Document.metadata\n", + " embedding vector (1536) -- 1536 works for OpenAI embeddings, change if needed\n", + " );\n", + "\n", + " -- Create a function to search for documents\n", + " create function match_documents (\n", + " query_embedding vector (1536),\n", + " filter jsonb default '{}'\n", + " ) returns table (\n", + " id uuid,\n", + " content text,\n", + " metadata jsonb,\n", + " similarity float\n", + " ) language plpgsql as $$\n", + " #variable_conflict use_column\n", + " begin\n", + " return query\n", + " select\n", + " id,\n", + " content,\n", + " metadata,\n", + " 1 - (documents.embedding <=> query_embedding) as similarity\n", + " from documents\n", + " where metadata @> filter\n", + " order by documents.embedding <=> query_embedding;\n", + " end;\n", + " $$;\n", + " ```" + ] + }, + { + "cell_type": "markdown", + "id": "68e75fb9", + "metadata": {}, + "source": [ + "## Creating a Supabase vector store\n", + "Next we'll want to create a Supabase vector store and seed it with some data. 
We've created a small demo set of documents that contain summaries of movies.\n", + "\n", + "Be sure to install the latest version of `langchain`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "78546fd7", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install langchain" + ] + }, + { + "cell_type": "markdown", + "id": "e06df198", + "metadata": {}, + "source": [ + "The self-query retriever requires you to have `lark` installed:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "63a8af5b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install lark" + ] + }, + { + "cell_type": "markdown", + "id": "114f768f", + "metadata": {}, + "source": [ + "We also need the `openai` and `supabase` packages:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "434ae558", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install openai" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "22431060-52c4-48a7-a97b-9f542b8b0928", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%pip install supabase==1.0.0" + ] + }, + { + "cell_type": "markdown", + "id": "83811610-7df3-4ede-b268-68a6a83ba9e2", + "metadata": {}, + "source": [ + "Since we are using `SupabaseVectorStore` and `OpenAIEmbeddings`, we have to load their API keys.\n", + "\n", + "- To find your `SUPABASE_URL` and `SUPABASE_SERVICE_KEY`, head to your Supabase project's [API settings](https://supabase.com/dashboard/project/_/settings/api).\n", + " - `SUPABASE_URL` corresponds to the Project URL\n", + " - `SUPABASE_SERVICE_KEY` corresponds to the `service_role` API key\n", + "\n", + "- To get your `OPENAI_API_KEY`, navigate to [API keys](https://platform.openai.com/account/api-keys) on your OpenAI account and create a new secret key." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "dd01b61b-7d32-4a55-85d6-b2d2d4f18840", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"SUPABASE_URL\"] = getpass.getpass(\"Supabase URL:\")\n", + "os.environ[\"SUPABASE_SERVICE_KEY\"] = getpass.getpass(\"Supabase Service Key:\")\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" + ] + }, + { + "cell_type": "markdown", + "id": "3aaf5075", + "metadata": {}, + "source": [ + "_Optional:_ If you're storing your Supabase and OpenAI API keys in a `.env` file, you can load them with [`dotenv`](https://github.com/theskumar/python-dotenv)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0089221", + "metadata": {}, + "outputs": [], + "source": [ + "%pip install python-dotenv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d56c5ef", + "metadata": {}, + "outputs": [], + "source": [ + "from dotenv import load_dotenv\n", + "\n", + "load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "id": "f6dd9aef", + "metadata": {}, + "source": [ + "First we'll create a Supabase client and instantiate a OpenAI embeddings class." 
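+ "\n", + "Once the client below exists, you can optionally sanity-check that the `documents` table created by the SQL setup script is visible to it. A minimal sketch (assuming the table and column names from the setup section above):\n", + "\n", + "```python\n", + "# Should return a (possibly empty) result set rather than raising an error.\n", + "res = supabase.table(\"documents\").select(\"id\").limit(1).execute()\n", + "print(res)\n", + "```"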
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cb4a5787", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from supabase.client import Client, create_client\n", + "from langchain.schema import Document\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import SupabaseVectorStore\n", + "\n", + "supabase_url = os.environ.get(\"SUPABASE_URL\")\n", + "supabase_key = os.environ.get(\"SUPABASE_SERVICE_KEY\")\n", + "supabase: Client = create_client(supabase_url, supabase_key)\n", + "\n", + "embeddings = OpenAIEmbeddings()" + ] + }, + { + "cell_type": "markdown", + "id": "0fca9b0b", + "metadata": {}, + "source": [ + "Next let's create our documents." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bcbe04d9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = [\n", + " Document(\n", + " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", + " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", + " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n", + " ),\n", + " Document(\n", + " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", + " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n", + " ),\n", + " Document(\n", + " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after them\",\n", + " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n", + " ),\n", + " Document(\n", + " page_content=\"Toys come alive and have a blast doing so\",\n", + " metadata={\"year\": 1995, \"genre\": \"animated\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", + " metadata={\n", + " \"year\": 1979,\n", + " \"rating\": 9.9,\n", + " \"director\": \"Andrei Tarkovsky\",\n", + " \"genre\": \"science fiction\",\n", + " },\n", + " ),\n", + "]\n", + "\n", + "vectorstore = SupabaseVectorStore.from_documents(docs, embeddings, client=supabase, table_name=\"documents\", query_name=\"match_documents\")" + ] + }, + { + "cell_type": "markdown", + "id": "5ecaab6d", + "metadata": {}, + "source": [ + "## Creating our self-querying retriever\n", + "Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents."
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86e34dbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"director\",\n", + " description=\"The name of the movie director\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " ),\n", + "]\n", + "document_content_description = \"Brief summary of a movie\"\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, vectorstore, document_content_description, metadata_field_info, verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ea9df8d4", + "metadata": {}, + "source": [ + "## Testing it out\n", + "And now we can try actually using our retriever!" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38a126e9", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'genre': 'science fiction', 'rating': 7.7}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fc3f1e6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=, attribute='rating', value=8.5) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'year': 2006, 'rating': 8.6, 'director': 'Satoshi Kon'})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a filter\n", + "retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "id": "b19d4da0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='women' filter=Comparison(comparator=, attribute='director', value='Greta Gerwig') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them', metadata={'year': 2019, 'rating': 8.3, 'director': 'Greta Gerwig'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a filter\n", + "retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f900e40e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='rating', value=8.5), Comparison(comparator=, attribute='genre', value='science fiction')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'year': 1979, 'genre': 'science fiction', 'rating': 9.9, 'director': 'Andrei Tarkovsky'})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a highly rated (above 8.5) science fiction film?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "12a51522", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='toys' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='year', value=1990), Comparison(comparator=, attribute='year', value=2005), Comparison(comparator=, attribute='genre', value='animated')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a movie after 1990 but before (or on) 2005 that's all about toys, and preferably is animated\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51", + "metadata": {}, + "source": [ + "## Filter k\n", + "\n", + "We can also use the self query retriever to specify `k`: the number of documents to fetch.\n", + "\n", + "We can do this by passing `enable_limit=True` to the constructor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bff36b88-b506-4877-9c63-e5a1a8d78e64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "retriever = SelfQueryRetriever.from_llm(\n", + " llm,\n", + " vectorstore,\n", + " document_content_description,\n", + " metadata_field_info,\n", + " enable_limit=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2758d229-4f97-499c-819f-888acaf8ee10", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'year': 1993, 'genre': 'science fiction', 'rating': 7.7}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'year': 1995, 'genre': 'animated'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"what are two movies about dinosaurs\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb b/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb new file mode 100644 index 0000000000..1e9128dc6f --- /dev/null +++ b/docs/extras/modules/data_connection/retrievers/self_query/vectara_self_query.ipynb @@ -0,0 +1,440 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "13afcae7", + "metadata": {}, + "source": [ + "# Vectara self-querying \n", + "\n", + ">[Vectara](https://docs.vectara.com/docs/) is a GenAI platform for developers. It provides a simple API to build Grounded Generation (aka Retrieval-augmented-generation) applications.\n", + "\n", + "In the notebook we'll demo the `SelfQueryRetriever` wrapped around a Vectara vector store. " + ] + }, + { + "cell_type": "markdown", + "id": "68e75fb9", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "You will need a Vectara account to use Vectara with LangChain. To get started, use the following steps (see our [quickstart](https://docs.vectara.com/docs/quickstart) guide):\n", + "1. [Sign up](https://console.vectara.com/signup) for a Vectara account if you don't already have one. Once you have completed your sign up you will have a Vectara customer ID. You can find your customer ID by clicking on your name, on the top-right of the Vectara console window.\n", + "2. Within your account you can create one or more corpora. Each corpus represents an area that stores text data upon ingest from input documents. To create a corpus, use the **\"Create Corpus\"** button. You then provide a name to your corpus as well as a description. Optionally you can define filtering attributes and apply some advanced options. If you click on your created corpus, you can see its name and corpus ID right on the top.\n", + "3. Next you'll need to create API keys to access the corpus. 
Click on the **\"Authorization\"** tab in the corpus view and then the **\"Create API Key\"** button. Give your key a name, and choose whether you want query only or query+index for your key. Click \"Create\" and you now have an active API key. Keep this key confidential. \n", + "\n", + "To use LangChain with Vectara, you'll need to have these three values: customer ID, corpus ID and api_key.\n", + "You can provide those to LangChain in two ways:\n", + "\n", + "1. Include in your environment these three variables: `VECTARA_CUSTOMER_ID`, `VECTARA_CORPUS_ID` and `VECTARA_API_KEY`.\n", + "\n", + "> For example, you can set these variables using os.environ and getpass as follows:\n", + "\n", + "```python\n", + "import os\n", + "import getpass\n", + "\n", + "os.environ[\"VECTARA_CUSTOMER_ID\"] = getpass.getpass(\"Vectara Customer ID:\")\n", + "os.environ[\"VECTARA_CORPUS_ID\"] = getpass.getpass(\"Vectara Corpus ID:\")\n", + "os.environ[\"VECTARA_API_KEY\"] = getpass.getpass(\"Vectara API Key:\")\n", + "```\n", + "\n", + "1. Provide them as arguments when creating the Vectara vectorstore object:\n", + "\n", + "```python\n", + "vectorstore = Vectara(\n", + " vectara_customer_id=vectara_customer_id,\n", + " vectara_corpus_id=vectara_corpus_id,\n", + " vectara_api_key=vectara_api_key\n", + " )\n", + "```\n", + "\n", + "**Note:** The self-query retriever requires you to have `lark` installed (`pip install lark`). " + ] + }, + { + "cell_type": "markdown", + "id": "742ac16d", + "metadata": {}, + "source": [ + "## Connecting to Vectara from LangChain\n", + "\n", + "In this example, we assume that you've created an account and a corpus, and added your VECTARA_CUSTOMER_ID, VECTARA_CORPUS_ID and VECTARA_API_KEY (created with permissions for both indexing and query) as environment variables.\n", + "\n", + "The corpus has 4 fields defined as metadata for filtering: year, director, rating, and genre\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "cb4a5787", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.embeddings import FakeEmbeddings\n", + "from langchain.schema import Document\n", + "from langchain.text_splitter import CharacterTextSplitter\n", + "from langchain.vectorstores import Vectara\n", + "from langchain.document_loaders import TextLoader\n", + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "bcbe04d9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "docs = [\n", + " Document(\n", + " page_content=\"A bunch of scientists bring back dinosaurs and mayhem breaks loose\",\n", + " metadata={\"year\": 1993, \"rating\": 7.7, \"genre\": \"science fiction\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Leo DiCaprio gets lost in a dream within a dream within a dream within a ...\",\n", + " metadata={\"year\": 2010, \"director\": \"Christopher Nolan\", \"rating\": 8.2},\n", + " ),\n", + " Document(\n", + " page_content=\"A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea\",\n", + " metadata={\"year\": 2006, \"director\": \"Satoshi Kon\", \"rating\": 8.6},\n", + " ),\n", + " Document(\n", + " page_content=\"A bunch of normal-sized women are supremely wholesome and some men pine after 
them\",\n", + " metadata={\"year\": 2019, \"director\": \"Greta Gerwig\", \"rating\": 8.3},\n", + " ),\n", + " Document(\n", + " page_content=\"Toys come alive and have a blast doing so\",\n", + " metadata={\"year\": 1995, \"genre\": \"animated\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Three men walk into the Zone, three men walk out of the Zone\",\n", + " metadata={\n", + " \"year\": 1979,\n", + " \"rating\": 9.9,\n", + " \"director\": \"Andrei Tarkovsky\",\n", + " \"genre\": \"science fiction\",\n", + " },\n", + " ),\n", + "]\n", + "\n", + "vectara = Vectara()\n", + "for doc in docs:\n", + " vectara.add_texts([doc.page_content], embedding=FakeEmbeddings(size=768), doc_metadata=doc.metadata)" + ] + }, + { + "cell_type": "markdown", + "id": "5ecaab6d", + "metadata": {}, + "source": [ + "## Creating our self-querying retriever\n", + "Now we can instantiate our retriever. To do this we'll need to provide some information upfront about the metadata fields that our documents support and a short description of the document contents." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "86e34dbf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain.retrievers.self_query.base import SelfQueryRetriever\n", + "from langchain.chains.query_constructor.base import AttributeInfo\n", + "\n", + "metadata_field_info = [\n", + " AttributeInfo(\n", + " name=\"genre\",\n", + " description=\"The genre of the movie\",\n", + " type=\"string or list[string]\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"year\",\n", + " description=\"The year the movie was released\",\n", + " type=\"integer\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"director\",\n", + " description=\"The name of the movie director\",\n", + " type=\"string\",\n", + " ),\n", + " AttributeInfo(\n", + " name=\"rating\", description=\"A 1-10 rating for the movie\", type=\"float\"\n", + " ),\n", + "]\n", + "document_content_description = \"Brief summary of a movie\"\n", + "llm = OpenAI(temperature=0)\n", + "retriever = SelfQueryRetriever.from_llm(\n", + " llm, vectara, document_content_description, metadata_field_info, verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "ea9df8d4", + "metadata": {}, + "source": [ + "## Testing it out\n", + "And now we can try actually using our retriever!" 
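+ "\n", + "Because the retriever was created with `verbose=True`, each call below first prints the structured query the LLM produced (its `query`, `filter` and `limit` fields) before it is translated into a Vectara metadata filter. If you want to inspect that structured query on its own, one hedged sketch is to call the retriever's internal `llm_chain` directly (the same call the deprecation warning below refers to):\n", + "\n", + "```python\n", + "# Build just the structured query, without running the search itself.\n", + "structured_query = retriever.llm_chain.predict_and_parse(query=\"What are some movies about dinosaurs\")\n", + "print(structured_query)\n", + "```"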
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "38a126e9", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/ofer/dev/langchain/libs/langchain/langchain/chains/llm.py:278: UserWarning: The predict_and_parse method is deprecated, instead pass an output parser directly to LLMChain.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'}),\n", + " Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...', metadata={'lang': 'eng', 'offset': '0', 'len': '76', 'year': '2010', 'director': 'Christopher Nolan', 'rating': '8.2', 'source': 'langchain'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"What are some movies about dinosaurs\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fc3f1e6e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Comparison(comparator=, attribute='rating', value=8.5) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea', metadata={'lang': 'eng', 'offset': '0', 'len': '116', 'year': '2006', 'director': 'Satoshi Kon', 'rating': '8.6', 'source': 'langchain'})]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a filter\n", + "retriever.get_relevant_documents(\"I want to watch a movie rated higher than 8.5\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "b19d4da0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='women' filter=Comparison(comparator=, attribute='director', value='Greta Gerwig') limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of normal-sized women are supremely wholesome and some men pine after 
them', metadata={'lang': 'eng', 'offset': '0', 'len': '82', 'year': '2019', 'director': 'Greta Gerwig', 'rating': '8.3', 'source': 'langchain'})]" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and a filter\n", + "retriever.get_relevant_documents(\"Has Greta Gerwig directed any movies about women\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "f900e40e", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query=' ' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='rating', value=8.5), Comparison(comparator=, attribute='genre', value='science fiction')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Three men walk into the Zone, three men walk out of the Zone', metadata={'lang': 'eng', 'offset': '0', 'len': '60', 'year': '1979', 'rating': '9.9', 'director': 'Andrei Tarkovsky', 'genre': 'science fiction', 'source': 'langchain'})]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a highly rated (above 8.5) science fiction film?\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "12a51522", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='toys' filter=Operation(operator=, arguments=[Comparison(comparator=, attribute='year', value=1990), Comparison(comparator=, attribute='year', value=2005), Comparison(comparator=, attribute='genre', value='animated')]) limit=None\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example specifies a query and composite filter\n", + "retriever.get_relevant_documents(\n", + " \"What's a movie after 1990 but before 2005 that's all about toys, and preferably is animated\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "39bd1de1-b9fe-4a98-89da-58d8a7a6ae51", + "metadata": {}, + "source": [ + "## Filter k\n", + "\n", + "We can also use the self query retriever to specify `k`: the number of documents to fetch.\n", + "\n", + "We can do this by passing `enable_limit=True` to the constructor." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "bff36b88-b506-4877-9c63-e5a1a8d78e64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "retriever = SelfQueryRetriever.from_llm(\n", + " llm,\n", + " vectara,\n", + " document_content_description,\n", + " metadata_field_info,\n", + " enable_limit=True,\n", + " verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2758d229-4f97-499c-819f-888acaf8ee10", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "query='dinosaur' filter=None limit=2\n" + ] + }, + { + "data": { + "text/plain": [ + "[Document(page_content='A bunch of scientists bring back dinosaurs and mayhem breaks loose', metadata={'lang': 'eng', 'offset': '0', 'len': '66', 'year': '1993', 'rating': '7.7', 'genre': 'science fiction', 'source': 'langchain'}),\n", + " Document(page_content='Toys come alive and have a blast doing so', metadata={'lang': 'eng', 'offset': '0', 'len': '41', 'year': '1995', 'genre': 'animated', 'source': 'langchain'})]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# This example only specifies a relevant query\n", + "retriever.get_relevant_documents(\"what are two movies about dinosaurs\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/use_cases/apis.ipynb b/docs/extras/use_cases/apis.ipynb index 1af0a7f3ce..8d9259c3ca 100644 --- a/docs/extras/use_cases/apis.ipynb +++ b/docs/extras/use_cases/apis.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "ea5c61b2-8b52-4270-bdb0-c4df88608f15", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: Interacting with APIs\n", + "---" + ] + }, { "cell_type": "markdown", "id": "a15e6a18", "metadata": {}, "source": [ - "# Interacting with APIs\n", - "\n", "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/apis.ipynb)\n", "\n", "## Use case \n", @@ -69,9 +78,7 @@ "cell_type": "code", "execution_count": 2, "id": "30b780e3", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "name": "stderr", @@ -415,7 +422,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/use_cases/chatbots.ipynb b/docs/extras/use_cases/chatbots.ipynb index 58e3ce5317..c67d595c9f 100644 --- a/docs/extras/use_cases/chatbots.ipynb +++ b/docs/extras/use_cases/chatbots.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "22fd28c9-9b48-476c-bca8-20efef7fdb14", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: Chatbots\n", + "---" + ] + }, { "cell_type": "markdown", "id": "ee7f95e4", "metadata": {}, "source": [ - "# Chatbots\n", - "\n", "[![Open In 
Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/chatbots.ipynb)\n", "\n", "## Use case\n", diff --git a/docs/extras/use_cases/code_understanding.ipynb b/docs/extras/use_cases/code_understanding.ipynb index 60a02b9bb3..df0cfbf9d1 100644 --- a/docs/extras/use_cases/code_understanding.ipynb +++ b/docs/extras/use_cases/code_understanding.ipynb @@ -1,11 +1,19 @@ { "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: Code understanding\n", + "---" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Code Understanding\n", - "\n", "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/code_understanding.ipynb)\n", "\n", "## Use case\n", @@ -1047,7 +1055,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/use_cases/extraction.ipynb b/docs/extras/use_cases/extraction.ipynb index 7aaa37f046..628026127a 100644 --- a/docs/extras/use_cases/extraction.ipynb +++ b/docs/extras/use_cases/extraction.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "df29b30a-fd27-4e08-8269-870df5631f9e", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: Extraction\n", + "---" + ] + }, { "cell_type": "markdown", "id": "b84edb4e", "metadata": {}, "source": [ - "# Extraction\n", - "\n", "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/extraction.ipynb)\n", "\n", "## Use case\n", @@ -589,7 +598,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/use_cases/more/_category_.yml b/docs/extras/use_cases/more/_category_.yml index 5e1490ecde..53055fb940 100644 --- a/docs/extras/use_cases/more/_category_.yml +++ b/docs/extras/use_cases/more/_category_.yml @@ -1,2 +1,2 @@ label: 'More' -position: 1 +position: 2 \ No newline at end of file diff --git a/docs/extras/use_cases/more/agents/agents.ipynb b/docs/extras/use_cases/more/agents/agents.ipynb index 98b65d1bbe..54ba5c29db 100644 --- a/docs/extras/use_cases/more/agents/agents.ipynb +++ b/docs/extras/use_cases/more/agents/agents.ipynb @@ -584,7 +584,7 @@ "\n", "Collectivly, this tells us: carefully inspect Agent traces and tool outputs. \n", "\n", - "As we saw with the [SQL use case](/docs/use_cases/sql), `ReAct agents` can be work very well for specific problems. \n", + "As we saw with the [SQL use case](/docs/use_cases/qa_structured/sql), `ReAct agents` can work very well for specific problems. \n", "\n", "But, as shown here, the result is degraded relative to what we see with the OpenAI agent."
] diff --git a/docs/extras/use_cases/more/code_writing/index.mdx b/docs/extras/use_cases/more/code_writing/index.mdx index 4dd704b5a0..218b438515 100644 --- a/docs/extras/use_cases/more/code_writing/index.mdx +++ b/docs/extras/use_cases/more/code_writing/index.mdx @@ -1,7 +1,3 @@ ---- -sidebar_position: 0 ---- - # Code writing :::warning diff --git a/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb b/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb new file mode 100644 index 0000000000..f8961174ce --- /dev/null +++ b/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb @@ -0,0 +1,307 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7f0b0c06-ee70-468c-8bf5-b023f9e5e0a2", + "metadata": {}, + "source": [ + "# Diffbot Graph Transformer\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/more/graph/diffbot_graphtransformer.ipynb)\n", + "\n", + "## Use case\n", + "\n", + "Text data often contain rich relationships and insights that can be useful for various analytics, recommendation engines, or knowledge management applications.\n", + "\n", + "Diffbot's NLP API allows for the extraction of entities, relationships, and semantic meaning from unstructured text data.\n", + "\n", + "By coupling Diffbot's NLP API with Neo4j, a graph database, you can create powerful, dynamic graph structures based on the information extracted from text. These graph structures are fully queryable and can be integrated into various applications.\n", + "\n", + "This combination allows for use cases such as:\n", + "\n", + "* Building knowledge graphs from textual documents, websites, or social media feeds.\n", + "* Generating recommendations based on semantic relationships in the data.\n", + "* Creating advanced search features that understand the relationships between entities.\n", + "* Building analytics dashboards that allow users to explore the hidden relationships in data.\n", + "\n", + "## Overview\n", + "\n", + "LangChain provides tools to interact with Graph Databases:\n", + "\n", + "1. `Construct knowledge graphs from text` using graph transformer and store integrations \n", + "2. `Query a graph database` using chains for query creation and execution\n", + "3. `Interact with a graph database` using agents for robust and flexible querying \n", + "\n", + "## Quickstart\n", + "\n", + "First, get required packages and set environment variables:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "975648da-b24f-4164-a671-6772179e12df", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install langchain langchain-experimental openai neo4j wikipedia" + ] + }, + { + "cell_type": "markdown", + "id": "77718977-629e-46c2-b091-f9191b9ec569", + "metadata": {}, + "source": [ + "## Diffbot NLP Service\n", + "\n", + "Diffbot's NLP service is a tool for extracting entities, relationships, and semantic context from unstructured text data.\n", + "This extracted information can be used to construct a knowledge graph.\n", + "To use their service, you'll need to obtain an API key from [Diffbot](https://www.diffbot.com/products/natural-language/)." 
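The next cell passes the key in directly as a placeholder string. As a small aside — assuming you keep the key in an environment variable named `DIFFBOT_API_KEY`, which is our own convention here rather than anything the library requires — a minimal sketch for keeping the key out of the notebook looks like this:

```python
import getpass
import os

# Minimal sketch (assumed env var name, not a library requirement):
# read the Diffbot token from the environment, falling back to an
# interactive prompt so the key is never hardcoded in the notebook.
diffbot_api_key = os.environ.get("DIFFBOT_API_KEY") or getpass.getpass("Diffbot API key: ")
```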
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2cbf97d0-3682-439b-8750-b695ff726789", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer\n", + "\n", + "diffbot_api_key = \"DIFFBOT_API_KEY\"\n", + "diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)" + ] + }, + { + "cell_type": "markdown", + "id": "5e3b894a-e3ee-46c7-8116-f8377f8f0159", + "metadata": {}, + "source": [ + "This code fetches Wikipedia articles about \"Warren Buffett\" and then uses `DiffbotGraphTransformer` to extract entities and relationships.\n", + "The `DiffbotGraphTransformer` outputs structured data as a `GraphDocument`, which can be used to populate a graph database.\n", + "Note that text chunking is avoided due to Diffbot's [character limit per API request](https://docs.diffbot.com/reference/introduction-to-natural-language-api)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "53f8df86-47a1-44a1-9a0f-6725b90703bc", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import WikipediaLoader\n", + "\n", + "query = \"Warren Buffett\"\n", + "raw_documents = WikipediaLoader(query=query).load()\n", + "graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)" + ] + }, + { + "cell_type": "markdown", + "id": "31bb851a-aab4-4b97-a6b7-fce397d32b47", + "metadata": {}, + "source": [ + "## Loading the data into a knowledge graph\n", + "\n", + "You will need to have a running Neo4j instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or run a Docker container. You can start a local Docker container by executing the following script:\n", + "```\n", + "docker run \\\n", + " --name neo4j \\\n", + " -p 7474:7474 -p 7687:7687 \\\n", + " -d \\\n", + " -e NEO4J_AUTH=neo4j/pleaseletmein \\\n", + " -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n", + " neo4j:latest\n", + "``` \n", + "If you are using the Docker container, you need to wait a couple of seconds for the database to start." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0b2b6641-5a5d-467c-b148-e6aad5e4baa7", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.graphs import Neo4jGraph\n", + "\n", + "url = \"bolt://localhost:7687\"\n", + "username = \"neo4j\"\n", + "password = \"pleaseletmein\"\n", + "\n", + "graph = Neo4jGraph(\n", + " url=url,\n", + " username=username,\n", + " password=password\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "0b15e840-fe6f-45db-9193-1b4e2df5c12c", + "metadata": {}, + "source": [ + "The `GraphDocuments` can be loaded into a knowledge graph using the `add_graph_documents` method."
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "1a67c4a8-955c-42a2-9c5d-de3ac0e640ec", + "metadata": {}, + "outputs": [], + "source": [ + "graph.add_graph_documents(graph_documents)" + ] + }, + { + "cell_type": "markdown", + "id": "ed411e05-2b03-460d-997e-938482774f40", + "metadata": {}, + "source": [ + "## Refresh graph schema information\n", + "If the schema of the database changes, you can refresh the schema information needed to generate Cypher statements." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "904c9ee3-787c-403f-857d-459ce5ad5a1b", + "metadata": {}, + "outputs": [], + "source": [ + "graph.refresh_schema()" + ] + }, + { + "cell_type": "markdown", + "id": "f19d1387-5899-4258-8c94-8ef5fa7db464", + "metadata": {}, + "source": [ + "## Querying the graph\n", + "We can now use the graph Cypher QA chain to ask questions of the graph. It is advisable to use **gpt-4** to construct Cypher queries to get the best experience." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "9393b732-67c8-45c1-9ec2-089f49c62448", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import GraphCypherQAChain\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "chain = GraphCypherQAChain.from_llm(\n", + " cypher_llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\"),\n", + " qa_llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"),\n", + " graph=graph, verbose=True,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "1a9b3652-b436-404d-aa25-5fb576f23dc0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", + "Generated Cypher:\n", + "\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: \"Warren Buffett\"})-[:EDUCATED_AT]->(o:Organization)\n", + "RETURN o.name\u001b[0m\n", + "Full Context:\n", + "\u001b[32;1m\u001b[1;3m[{'o.name': 'New York Institute of Finance'}, {'o.name': 'Alice Deal Junior High School'}, {'o.name': 'Woodrow Wilson High School'}, {'o.name': 'University of Nebraska'}]\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'Warren Buffett attended the University of Nebraska.'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.run(\"Which university did Warren Buffett attend?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "adc0ba0f-a62c-4875-89ce-da717f3ab148", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n", + "Generated Cypher:\n", + "\u001b[32;1m\u001b[1;3mMATCH (p:Person)-[r:EMPLOYEE_OR_MEMBER_OF]->(o:Organization) WHERE o.name = 'Berkshire Hathaway' RETURN p.name\u001b[0m\n", + "Full Context:\n", + "\u001b[32;1m\u001b[1;3m[{'p.name': 'Charlie Munger'}, {'p.name': 'Oliver Chace'}, {'p.name': 'Howard Buffett'}, {'p.name': 'Howard'}, {'p.name': 'Susan Buffett'}, {'p.name': 'Warren Buffett'}]\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'Charlie Munger, Oliver Chace, Howard Buffett, Susan Buffett, and Warren Buffett are or were working at Berkshire Hathaway.'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "chain.run(\"Who is or was
working at Berkshire Hathaway?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d636954b-d967-4e96-9489-92e11c74af35", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.4" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/use_cases/more/self_check/index.mdx b/docs/extras/use_cases/more/self_check/index.mdx index 9880394eb5..a424ea4370 100644 --- a/docs/extras/use_cases/more/self_check/index.mdx +++ b/docs/extras/use_cases/more/self_check/index.mdx @@ -1,7 +1,3 @@ ---- -sidebar_position: 0 ---- - # Self-checking One of the main issues with using LLMs is that they can often hallucinate and make false claims. One of the surprisingly effective ways to remediate this is to use the LLM itself to check its own answers. diff --git a/docs/extras/use_cases/qa_structured/_category_.yml b/docs/extras/use_cases/qa_structured/_category_.yml new file mode 100644 index 0000000000..209e3895ff --- /dev/null +++ b/docs/extras/use_cases/qa_structured/_category_.yml @@ -0,0 +1,3 @@ +label: 'QA over structured data' +collapsed: false +position: 0.5 diff --git a/docs/extras/use_cases/qa_structured/integrations/_category_.yml b/docs/extras/use_cases/qa_structured/integrations/_category_.yml new file mode 100644 index 0000000000..4a4b0b2f28 --- /dev/null +++ b/docs/extras/use_cases/qa_structured/integrations/_category_.yml @@ -0,0 +1 @@ +label: 'Integration-specific' diff --git a/docs/extras/use_cases/qa_structured/integrations/elasticsearch.ipynb b/docs/extras/use_cases/qa_structured/integrations/elasticsearch.ipynb new file mode 100644 index 0000000000..e28bc6bf61 --- /dev/null +++ b/docs/extras/use_cases/qa_structured/integrations/elasticsearch.ipynb @@ -0,0 +1,158 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Elasticsearch\n", + "\n", + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/qa_structured/integrations/elasticsearch.ipynb)\n", + "\n", + "We can use LLMs to interact with Elasticsearch analytics databases in natural language.\n", + "\n", + "This chain builds search queries via the Elasticsearch DSL API (filters and aggregations).\n", + "\n", + "The Elasticsearch client must have permissions for index listing, mapping description and search queries.\n", + "\n", + "See [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html) for instructions on how to run Elasticsearch locally." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "! 
pip install langchain langchain-experimental openai elasticsearch\n", + "\n", + "# Set env var OPENAI_API_KEY or load from a .env file\n", + "# import dotenv\n", + "\n", + "# dotenv.load_dotenv()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch\n", + "\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.chains.elasticsearch_database import ElasticsearchDatabaseChain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize Elasticsearch python client.\n", + "# See https://elasticsearch-py.readthedocs.io/en/v8.8.2/api.html#elasticsearch.Elasticsearch\n", + "ELASTIC_SEARCH_SERVER = \"https://elastic:pass@localhost:9200\"\n", + "db = Elasticsearch(ELASTIC_SEARCH_SERVER)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment the next cell to initially populate your db." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# customers = [\n", + "# {\"firstname\": \"Jennifer\", \"lastname\": \"Walters\"},\n", + "# {\"firstname\": \"Monica\",\"lastname\":\"Rambeau\"},\n", + "# {\"firstname\": \"Carol\",\"lastname\":\"Danvers\"},\n", + "# {\"firstname\": \"Wanda\",\"lastname\":\"Maximoff\"},\n", + "# {\"firstname\": \"Jennifer\",\"lastname\":\"Takeda\"},\n", + "# ]\n", + "# for i, customer in enumerate(customers):\n", + "# db.create(index=\"customers\", document=customer, id=i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm = ChatOpenAI(model_name=\"gpt-4\", temperature=0)\n", + "chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"What are the first names of all the customers?\"\n", + "chain.run(question)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can customize the prompt." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains.elasticsearch_database.prompts import DEFAULT_DSL_TEMPLATE\n", + "from langchain.prompts.prompt import PromptTemplate\n", + "\n", + "PROMPT_TEMPLATE = \"\"\"Given an input question, create a syntactically correct Elasticsearch query to run. Unless the user specifies in their question a specific number of examples they wish to obtain, always limit your query to at most {top_k} results. You can order the results by a relevant column to return the most interesting examples in the database.\n", + "\n", + "Unless told otherwise, do not query for all the columns from a specific index; only ask for the few relevant columns given the question.\n", + "\n", + "Pay attention to use only the column names that you can see in the mapping description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which index.
Return the query as valid json.\n", + "\n", + "Use the following format:\n", + "\n", + "Question: Question here\n", + "ESQuery: Elasticsearch Query formatted as json\n", + "\"\"\"\n", + "\n", + "PROMPT = PromptTemplate.from_template(\n", + " PROMPT_TEMPLATE,\n", + ")\n", + "chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, query_prompt=PROMPT)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/use_cases/qa_structured/integrations/myscale_vector_sql.ipynb b/docs/extras/use_cases/qa_structured/integrations/myscale_vector_sql.ipynb new file mode 100644 index 0000000000..65bd8323ed --- /dev/null +++ b/docs/extras/use_cases/qa_structured/integrations/myscale_vector_sql.ipynb @@ -0,0 +1,200 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "245065c6", + "metadata": {}, + "source": [ + "# Vector SQL Retriever with MyScale\n", + "\n", + ">[MyScale](https://docs.myscale.com/en/) is an integrated vector database. You can access your database in SQL and also from here, in LangChain. MyScale can make use of [various data types and functions for filters](https://blog.myscale.com/2023/06/06/why-integrated-database-solution-can-boost-your-llm-apps/#filter-on-anything-without-constraints). It can boost your LLM app whether you are scaling up your data or expanding your system to a broader application." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0246c5bf", + "metadata": {}, + "outputs": [], + "source": [ + "!pip3 install clickhouse-sqlalchemy InstructorEmbedding sentence_transformers openai langchain-experimental" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7585d2c3", + "metadata": {}, + "outputs": [], + "source": [ + "from os import environ\n", + "import getpass\n", + "from typing import Dict, Any\n", + "from langchain import OpenAI, SQLDatabase, LLMChain\n", + "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n", + "from sqlalchemy import create_engine, Column, MetaData\n", + "from langchain import PromptTemplate\n", + "\n", + "MYSCALE_HOST = \"msc-1decbcc9.us-east-1.aws.staging.myscale.cloud\"\n", + "MYSCALE_PORT = 443\n", + "MYSCALE_USER = \"chatdata\"\n", + "MYSCALE_PASSWORD = \"myscale_rocks\"\n", + "OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n", + "\n", + "engine = create_engine(\n", + " f\"clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https\"\n", + ")\n", + "metadata = MetaData(bind=engine)\n", + "environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e08d9ddc", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings import HuggingFaceInstructEmbeddings\n", + "from langchain_experimental.sql.vector_sql import VectorSQLOutputParser\n", + "\n", + "output_parser = VectorSQLOutputParser.from_embeddings(\n", + " model=HuggingFaceInstructEmbeddings(\n", + " model_name=\"hkunlp/instructor-xl\", model_kwargs={\"device\": \"cpu\"}\n", + " )\n", + ")" + ] + }, + { + "cell_type": "code",
"execution_count": null, + "id": "84b705b2", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from langchain.llms import OpenAI\n", + "from langchain.callbacks import StdOutCallbackHandler\n", + "\n", + "from langchain.utilities.sql_database import SQLDatabase\n", + "from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n", + "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n", + "\n", + "chain = VectorSQLDatabaseChain(\n", + " llm_chain=LLMChain(\n", + " llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n", + " prompt=MYSCALE_PROMPT,\n", + " ),\n", + " top_k=10,\n", + " return_direct=True,\n", + " sql_cmd_parser=output_parser,\n", + " database=SQLDatabase(engine, None, metadata),\n", + ")\n", + "\n", + "import pandas as pd\n", + "\n", + "pd.DataFrame(\n", + " chain.run(\n", + " \"Please give me 10 papers to ask what is PageRank?\",\n", + " callbacks=[StdOutCallbackHandler()],\n", + " )\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "6c09cda0", + "metadata": {}, + "source": [ + "## SQL Database as Retriever" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "734d7ff5", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain\n", + "\n", + "from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n", + "from langchain_experimental.retrievers.vector_sql_database \\\n", + " import VectorSQLDatabaseChainRetriever\n", + "from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n", + "from langchain_experimental.sql.vector_sql import VectorSQLRetrieveAllOutputParser\n", + "\n", + "output_parser_retrieve_all = VectorSQLRetrieveAllOutputParser.from_embeddings(\n", + " output_parser.model\n", + ")\n", + "\n", + "chain = VectorSQLDatabaseChain.from_llm(\n", + " llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n", + " prompt=MYSCALE_PROMPT,\n", + " top_k=10,\n", + " return_direct=True,\n", + " db=SQLDatabase(engine, None, metadata),\n", + " sql_cmd_parser=output_parser_retrieve_all,\n", + " native_format=True,\n", + ")\n", + "\n", + "# You need all those keys to get docs\n", + "retriever = VectorSQLDatabaseChainRetriever(sql_db_chain=chain, page_content_key=\"abstract\")\n", + "\n", + "document_with_metadata_prompt = PromptTemplate(\n", + " input_variables=[\"page_content\", \"id\", \"title\", \"authors\", \"pubdate\", \"categories\"],\n", + " template=\"Content:\\n\\tTitle: {title}\\n\\tAbstract: {page_content}\\n\\tAuthors: {authors}\\n\\tDate of Publication: {pubdate}\\n\\tCategories: {categories}\\nSOURCE: {id}\",\n", + ")\n", + "\n", + "chain = RetrievalQAWithSourcesChain.from_chain_type(\n", + " ChatOpenAI(\n", + " model_name=\"gpt-3.5-turbo-16k\", openai_api_key=OPENAI_API_KEY, temperature=0.6\n", + " ),\n", + " retriever=retriever,\n", + " chain_type=\"stuff\",\n", + " chain_type_kwargs={\n", + " \"document_prompt\": document_with_metadata_prompt,\n", + " },\n", + " return_source_documents=True,\n", + ")\n", + "ans = chain(\"Please give me 10 papers to ask what is PageRank?\",\n", + " callbacks=[StdOutCallbackHandler()])\n", + "print(ans[\"answer\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4948ff25", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": 
{ + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/use_cases/qa_structured/sql.ipynb b/docs/extras/use_cases/qa_structured/sql.ipynb new file mode 100644 index 0000000000..23bde6a2a5 --- /dev/null +++ b/docs/extras/use_cases/qa_structured/sql.ipynb @@ -0,0 +1,1268 @@ +{ + "cells": [ + { + "cell_type": "raw", + "metadata": {}, + "source": [ + "---\n", + "title: SQL\n", + "sidebar_position: 2\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/qa_structured/sql.ipynb)\n", + "\n", + "## Use case\n", + "\n", + "Enterprise data is often stored in SQL databases.\n", + "\n", + "LLMs make it possible to interact with SQL databases using natural language.\n", + "\n", + "LangChain offers SQL Chains and Agents to build and run SQL queries based on natural language prompts. \n", + "\n", + "These are compatible with any SQL dialect supported by SQLAlchemy (e.g., MySQL, PostgreSQL, Oracle SQL, Databricks, SQLite).\n", + "\n", + "They enable use cases such as:\n", + "\n", + "- Generating queries that will be run based on natural language questions\n", + "- Creating chatbots that can answer questions based on database data\n", + "- Building custom dashboards based on insights a user wants to analyze\n", + "\n", + "## Overview\n", + "\n", + "LangChain provides tools to interact with SQL Databases:\n", + "\n", + "1. `Build SQL queries` based on natural language user questions\n", + "2. `Query a SQL database` using chains for query creation and execution\n", + "3. `Interact with a SQL database` using agents for robust and flexible querying \n", + "\n", + "![sql_usecase.png](/img/sql_usecase.png)\n", + "\n", + "## Quickstart\n", + "\n", + "First, get required packages and set environment variables:" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "! pip install langchain langchain-experimental openai\n", + "\n", + "# Set env var OPENAI_API_KEY or load from a .env file\n", + "# import dotenv\n", + "\n", + "# dotenv.load_dotenv()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The example below will use a SQLite connection with the Chinook database. \n", + " \n", + "Follow [installation steps](https://database.guide/2-sample-databases-sqlite/) to create `Chinook.db` in the same directory as this notebook:\n", + "\n", + "* Save [this file](https://raw.githubusercontent.com/lerocha/chinook-database/master/ChinookDatabase/DataSources/Chinook_Sqlite.sql) to the directory as `Chinook_Sqlite.sql`\n", + "* Run `sqlite3 Chinook.db`\n", + "* Run `.read Chinook_Sqlite.sql`\n", + "* Test `SELECT * FROM Artist LIMIT 10;`\n", + "\n", + "Now, `Chinook.db` is in our directory.\n", + "\n", + "Let's create a `SQLDatabaseChain` to create and execute SQL queries."
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.utilities import SQLDatabase\n", + "from langchain.llms import OpenAI\n", + "from langchain_experimental.sql import SQLDatabaseChain\n", + "\n", + "db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n", + "llm = OpenAI(temperature=0, verbose=True)\n", + "db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new SQLDatabaseChain chain...\u001b[0m\n", + "How many employees are there?\n", + "SQLQuery:\u001b[32;1m\u001b[1;3mSELECT COUNT(*) FROM \"Employee\";\u001b[0m\n", + "SQLResult: \u001b[33;1m\u001b[1;3m[(8,)]\u001b[0m\n", + "Answer:\u001b[32;1m\u001b[1;3mThere are 8 employees.\u001b[0m\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 8 employees.'" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db_chain.run(\"How many employees are there?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Note that this both creates and executes the query. \n", + "\n", + "In the following sections, we will cover the 3 different use cases mentioned in the overview.\n", + "\n", + "### Go deeper\n", + "\n", + "You can also load tabular data from sources other than SQL databases. For example:\n", + "\n", + "- [Loading a CSV file](/docs/integrations/document_loaders/csv)\n", + "- [Loading a Pandas DataFrame](/docs/integrations/document_loaders/pandas_dataframe)\n", + "\n", + "Here you can [check the full list of Document Loaders](/docs/integrations/document_loaders/)." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case 1: Text-to-SQL query\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.chains import create_sql_query_chain" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's create the chain that will build the SQL Query:\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SELECT COUNT(*) FROM Employee\n" + ] + } + ], + "source": [ + "chain = create_sql_query_chain(ChatOpenAI(temperature=0), db)\n", + "response = chain.invoke({\"question\":\"How many employees are there\"})\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After building the SQL query based on a user question, we can execute the query:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'[(8,)]'" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db.run(response)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the SQL Query Builder chain **only created** the query, and we handled the **query execution separately**."
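If you would rather expose the two steps as a single call while keeping the separation explicit, a small wrapper of your own is enough. The sketch below is just an illustration (the `ask` helper is ours, not a LangChain API); it reuses the `chain` and `db` objects defined above:

```python
# Minimal sketch: compose query generation and execution ourselves.
# `ask` is a hypothetical helper, not part of LangChain.
def ask(question: str) -> str:
    sql = chain.invoke({"question": question})  # step 1: build the SQL query
    return db.run(sql)  # step 2: execute it against the database

ask("How many employees are there")  # -> '[(8,)]'
```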
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Go deeper\n", + "\n", + "**Looking under the hood**\n", + "\n", + "We can look at the [LangSmith trace](https://smith.langchain.com/public/c8fa52ea-be46-4829-bde2-52894970b830/r) to unpack this:\n", + "\n", + "[Some papers](https://arxiv.org/pdf/2204.00498.pdf) have reported good performance when prompting with:\n", + " \n", + "* A `CREATE TABLE` description for each table, which includes column names, their types, etc.\n", + "* Followed by three example rows in a `SELECT` statement\n", + "\n", + "`create_sql_query_chain` adopts this best practice (see more in this [blog](https://blog.langchain.dev/llms-and-sql/)). \n", + "![sql_usecase.png](/img/create_sql_query_chain.png)\n", + "\n", + "**Improvements**\n", + "\n", + "The query builder can be improved in several ways, such as (but not limited to):\n", + "\n", + "- Customizing the database description for your specific use case\n", + "- Hardcoding a few examples of questions and their corresponding SQL queries in the prompt\n", + "- Using a vector database to include dynamic examples that are relevant to the specific user question\n", + "\n", + "All these examples involve customizing the chain's prompt. \n", + "\n", + "For example, we can include a few examples in our prompt like so:" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.prompts import PromptTemplate\n", + "\n", + "TEMPLATE = \"\"\"Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer.\n", + "Use the following format:\n", + "\n", + "Question: \"Question here\"\n", + "SQLQuery: \"SQL Query to run\"\n", + "SQLResult: \"Result of the SQLQuery\"\n", + "Answer: \"Final answer here\"\n", + "\n", + "Only use the following tables:\n", + "\n", + "{table_info}.\n", + "\n", + "Some examples of SQL queries that correspond to questions are:\n", + "\n", + "{few_shot_examples}\n", + "\n", + "Question: {input}\"\"\"\n", + "\n", + "CUSTOM_PROMPT = PromptTemplate(\n", + " input_variables=[\"input\", \"few_shot_examples\", \"table_info\", \"dialect\"], template=TEMPLATE\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also access this [prompt](https://smith.langchain.com/hub/rlm/text-to-sql) in the LangChain prompt hub.\n", + "\n", + "This will work with your [LangSmith API key](https://docs.smith.langchain.com/)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain import hub\n", + "\n", + "CUSTOM_PROMPT = hub.pull(\"rlm/text-to-sql\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case 2: Text-to-SQL query and execution\n", + "\n", + "We can use `SQLDatabaseChain` from `langchain_experimental` to create and run SQL queries."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.llms import OpenAI\n", + "from langchain_experimental.sql import SQLDatabaseChain\n", + "\n", + "llm = OpenAI(temperature=0, verbose=True)\n", + "db_chain = SQLDatabaseChain.from_llm(llm, db, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new SQLDatabaseChain chain...\u001b[0m\n", + "How many employees are there?\n", + "SQLQuery:\u001b[32;1m\u001b[1;3mSELECT COUNT(*) FROM \"Employee\";\u001b[0m\n", + "SQLResult: \u001b[33;1m\u001b[1;3m[(8,)]\u001b[0m\n", + "Answer:\u001b[32;1m\u001b[1;3mThere are 8 employees.\u001b[0m\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 8 employees.'" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "db_chain.run(\"How many employees are there?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, we get the same result as the previous case.\n", + "\n", + "Here, the chain **also handles the query execution** and provides a final answer based on the user question and the query result.\n", + "\n", + "**Be careful** while using this approach, as it is susceptible to `SQL injection`:\n", + "\n", + "* The chain executes queries that were created by an LLM and have not been validated\n", + "* e.g., records may be created, modified or deleted unintentionally\n", + "\n", + "This is why `SQLDatabaseChain` lives inside `langchain_experimental`." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Go deeper\n", + "\n", + "**Looking under the hood**\n", + "\n", + "We can use the [LangSmith trace](https://smith.langchain.com/public/7f202a0c-1e35-42d6-a84a-6c2a58f697ef/r) to see what is happening under the hood:\n", + "\n", + "* As discussed above, first we create the query:\n", + "\n", + "```\n", + "text: ' SELECT COUNT(*) FROM \"Employee\";'\n", + "```\n", + "\n", + "* Then, it executes the query and passes the results to an LLM for synthesis.\n", + "\n", + "![sql_usecase.png](/img/sqldbchain_trace.png)\n", + "\n", + "**Improvements**\n", + "\n", + "The performance of the `SQLDatabaseChain` can be enhanced in several ways:\n", + "\n", + "- [Adding sample rows](#adding-sample-rows)\n", + "- [Specifying custom table information](/docs/integrations/tools/sqlite#custom-table-info)\n", + "- [Using Query Checker](/docs/integrations/tools/sqlite#use-query-checker) to self-correct invalid SQL using the parameter `use_query_checker=True`\n", + "- [Customizing the LLM Prompt](/docs/integrations/tools/sqlite#customize-prompt) to include specific instructions or relevant information, using the parameter `prompt=CUSTOM_PROMPT`\n", + "- [Getting intermediate steps](/docs/integrations/tools/sqlite#return-intermediate-steps) to access the SQL statement as well as the final result using the parameter `return_intermediate_steps=True`\n", + "- [Limiting the number of rows](/docs/integrations/tools/sqlite#choosing-how-to-limit-the-number-of-rows-returned) a query will return using the parameter `top_k=5`\n", + "\n", + "You might find [SQLDatabaseSequentialChain](/docs/integrations/tools/sqlite#sqldatabasesequentialchain)\n", + "useful for cases in which the number of tables in the database is large.\n", + "\n", + "This `Sequential Chain` handles the
process of:\n", + "\n", + "1. Determining which tables to use based on the user question\n", + "2. Calling the normal SQL database chain using only relevant tables\n", + "\n", + "**Adding Sample Rows**\n", + "\n", + "Providing sample data can help the LLM construct correct queries when the data format is not obvious. \n", + "\n", + "For example, we can tell the LLM that artists are saved with their full names by providing two rows from the Track table.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "db = SQLDatabase.from_uri(\n", + " \"sqlite:///Chinook.db\",\n", + " include_tables=['Track'], # we include only one table to save tokens in the prompt :)\n", + " sample_rows_in_table_info=2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The sample rows are added to the prompt after each corresponding table's column information.\n", + "\n", + "We can use `db.table_info` and check which sample rows are included:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "CREATE TABLE \"Track\" (\n", + "\t\"TrackId\" INTEGER NOT NULL, \n", + "\t\"Name\" NVARCHAR(200) NOT NULL, \n", + "\t\"AlbumId\" INTEGER, \n", + "\t\"MediaTypeId\" INTEGER NOT NULL, \n", + "\t\"GenreId\" INTEGER, \n", + "\t\"Composer\" NVARCHAR(220), \n", + "\t\"Milliseconds\" INTEGER NOT NULL, \n", + "\t\"Bytes\" INTEGER, \n", + "\t\"UnitPrice\" NUMERIC(10, 2) NOT NULL, \n", + "\tPRIMARY KEY (\"TrackId\"), \n", + "\tFOREIGN KEY(\"MediaTypeId\") REFERENCES \"MediaType\" (\"MediaTypeId\"), \n", + "\tFOREIGN KEY(\"GenreId\") REFERENCES \"Genre\" (\"GenreId\"), \n", + "\tFOREIGN KEY(\"AlbumId\") REFERENCES \"Album\" (\"AlbumId\")\n", + ")\n", + "\n", + "/*\n", + "2 rows from Track table:\n", + "TrackId\tName\tAlbumId\tMediaTypeId\tGenreId\tComposer\tMilliseconds\tBytes\tUnitPrice\n", + "1\tFor Those About To Rock (We Salute You)\t1\t1\t1\tAngus Young, Malcolm Young, Brian Johnson\t343719\t11170334\t0.99\n", + "2\tBalls to the Wall\t2\t2\t1\tNone\t342562\t5510424\t0.99\n", + "*/\n" + ] + } + ], + "source": [ + "print(db.table_info)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Case 3: SQL agents\n", + "\n", + "LangChain has an SQL Agent which provides a more flexible way of interacting with SQL Databases than the `SQLDatabaseChain`.\n", + "\n", + "The main advantages of using the SQL Agent are:\n", + "\n", + "- It can answer questions based on the databases' schema as well as on the databases' content (like describing a specific table)\n", + "- It can recover from errors by running a generated query, catching the traceback and regenerating it correctly\n", + "\n", + "To initialize the agent, we use the `create_sql_agent` function. \n", + "\n", + "This agent contains the `SQLDatabaseToolkit` which contains tools to: \n", + "\n", + "* Create and execute queries\n", + "* Check query syntax\n", + "* Retrieve table descriptions\n", + "* ... 
and more" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import create_sql_agent\n", + "from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n", + "# from langchain.agents import AgentExecutor\n", + "from langchain.agents.agent_types import AgentType\n", + "\n", + "db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n", + "llm = OpenAI(temperature=0, verbose=True)\n", + "\n", + "agent_executor = create_sql_agent(\n", + " llm=OpenAI(temperature=0),\n", + " toolkit=SQLDatabaseToolkit(db=db, llm=OpenAI(temperature=0)),\n", + " verbose=True,\n", + " agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Agent task example #1 - Running queries\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3mAction: sql_db_list_tables\n", + "Action Input: \u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mAlbum, Artist, Customer, Employee, Genre, Invoice, InvoiceLine, MediaType, Playlist, PlaylistTrack, Track\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I should query the schema of the Invoice and Customer tables.\n", + "Action: sql_db_schema\n", + "Action Input: Invoice, Customer\u001b[0m\n", + "Observation: \u001b[33;1m\u001b[1;3m\n", + "CREATE TABLE \"Customer\" (\n", + "\t\"CustomerId\" INTEGER NOT NULL, \n", + "\t\"FirstName\" NVARCHAR(40) NOT NULL, \n", + "\t\"LastName\" NVARCHAR(20) NOT NULL, \n", + "\t\"Company\" NVARCHAR(80), \n", + "\t\"Address\" NVARCHAR(70), \n", + "\t\"City\" NVARCHAR(40), \n", + "\t\"State\" NVARCHAR(40), \n", + "\t\"Country\" NVARCHAR(40), \n", + "\t\"PostalCode\" NVARCHAR(10), \n", + "\t\"Phone\" NVARCHAR(24), \n", + "\t\"Fax\" NVARCHAR(24), \n", + "\t\"Email\" NVARCHAR(60) NOT NULL, \n", + "\t\"SupportRepId\" INTEGER, \n", + "\tPRIMARY KEY (\"CustomerId\"), \n", + "\tFOREIGN KEY(\"SupportRepId\") REFERENCES \"Employee\" (\"EmployeeId\")\n", + ")\n", + "\n", + "/*\n", + "3 rows from Customer table:\n", + "CustomerId\tFirstName\tLastName\tCompany\tAddress\tCity\tState\tCountry\tPostalCode\tPhone\tFax\tEmail\tSupportRepId\n", + "1\tLuís\tGonçalves\tEmbraer - Empresa Brasileira de Aeronáutica S.A.\tAv. 
Brigadeiro Faria Lima, 2170\tSão José dos Campos\tSP\tBrazil\t12227-000\t+55 (12) 3923-5555\t+55 (12) 3923-5566\tluisg@embraer.com.br\t3\n", + "2\tLeonie\tKöhler\tNone\tTheodor-Heuss-Straße 34\tStuttgart\tNone\tGermany\t70174\t+49 0711 2842222\tNone\tleonekohler@surfeu.de\t5\n", + "3\tFrançois\tTremblay\tNone\t1498 rue Bélanger\tMontréal\tQC\tCanada\tH2G 1A7\t+1 (514) 721-4711\tNone\tftremblay@gmail.com\t3\n", + "*/\n", + "\n", + "\n", + "CREATE TABLE \"Invoice\" (\n", + "\t\"InvoiceId\" INTEGER NOT NULL, \n", + "\t\"CustomerId\" INTEGER NOT NULL, \n", + "\t\"InvoiceDate\" DATETIME NOT NULL, \n", + "\t\"BillingAddress\" NVARCHAR(70), \n", + "\t\"BillingCity\" NVARCHAR(40), \n", + "\t\"BillingState\" NVARCHAR(40), \n", + "\t\"BillingCountry\" NVARCHAR(40), \n", + "\t\"BillingPostalCode\" NVARCHAR(10), \n", + "\t\"Total\" NUMERIC(10, 2) NOT NULL, \n", + "\tPRIMARY KEY (\"InvoiceId\"), \n", + "\tFOREIGN KEY(\"CustomerId\") REFERENCES \"Customer\" (\"CustomerId\")\n", + ")\n", + "\n", + "/*\n", + "3 rows from Invoice table:\n", + "InvoiceId\tCustomerId\tInvoiceDate\tBillingAddress\tBillingCity\tBillingState\tBillingCountry\tBillingPostalCode\tTotal\n", + "1\t2\t2009-01-01 00:00:00\tTheodor-Heuss-Straße 34\tStuttgart\tNone\tGermany\t70174\t1.98\n", + "2\t4\t2009-01-02 00:00:00\tUllevålsveien 14\tOslo\tNone\tNorway\t0171\t3.96\n", + "3\t8\t2009-01-03 00:00:00\tGrétrystraat 63\tBrussels\tNone\tBelgium\t1000\t5.94\n", + "*/\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I should query the total sales per country.\n", + "Action: sql_db_query\n", + "Action Input: SELECT Country, SUM(Total) AS TotalSales FROM Invoice INNER JOIN Customer ON Invoice.CustomerId = Customer.CustomerId GROUP BY Country ORDER BY TotalSales DESC LIMIT 10\u001b[0m\n", + "Observation: \u001b[36;1m\u001b[1;3m[('USA', 523.0600000000003), ('Canada', 303.9599999999999), ('France', 195.09999999999994), ('Brazil', 190.09999999999997), ('Germany', 156.48), ('United Kingdom', 112.85999999999999), ('Czech Republic', 90.24000000000001), ('Portugal', 77.23999999999998), ('India', 75.25999999999999), ('Chile', 46.62)]\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n", + "Final Answer: The country with the highest total sales is the USA, with a total of $523.06.\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'The country with the highest total sales is the USA, with a total of $523.06.'" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent_executor.run(\n", + " \"List the total sales per country. 
Which country's customers spent the most?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Looking at the [LangSmith trace](https://smith.langchain.com/public/a86dbe17-5782-4020-bce6-2de85343168a/r), we can see:\n", + "\n", + "* The agent is using a ReAct-style prompt\n", + "* First, it will look at the tables: `Action: sql_db_list_tables` using tool `sql_db_list_tables`\n", + "* Given the tables as an observation, it `thinks` and then determines the next `action`:\n", + "\n", + "```\n", + "Observation: Album, Artist, Customer, Employee, Genre, Invoice, InvoiceLine, MediaType, Playlist, PlaylistTrack, Track\n", + "Thought: I should query the schema of the Invoice and Customer tables.\n", + "Action: sql_db_schema\n", + "Action Input: Invoice, Customer\n", + "```\n", + "\n", + "* It then formulates the query using the schema from tool `sql_db_schema`\n", + "\n", + "```\n", + "Thought: I should query the total sales per country.\n", + "Action: sql_db_query\n", + "Action Input: SELECT Country, SUM(Total) AS TotalSales FROM Invoice INNER JOIN Customer ON Invoice.CustomerId = Customer.CustomerId GROUP BY Country ORDER BY TotalSales DESC LIMIT 10\n", + "```\n", + "\n", + "* It finally executes the generated query using tool `sql_db_query`\n", + "\n", + "![sql_usecase.png](/img/SQLDatabaseToolkit.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Agent task example #2 - Describing a Table" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3mAction: sql_db_list_tables\n", + "Action Input: \u001b[0m\n", + "Observation: \u001b[38;5;200m\u001b[1;3mAlbum, Artist, Customer, Employee, Genre, Invoice, InvoiceLine, MediaType, Playlist, PlaylistTrack, Track\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m The PlaylistTrack table is the most relevant to the question.\n", + "Action: sql_db_schema\n", + "Action Input: PlaylistTrack\u001b[0m\n", + "Observation: \u001b[33;1m\u001b[1;3m\n", + "CREATE TABLE \"PlaylistTrack\" (\n", + "\t\"PlaylistId\" INTEGER NOT NULL, \n", + "\t\"TrackId\" INTEGER NOT NULL, \n", + "\tPRIMARY KEY (\"PlaylistId\", \"TrackId\"), \n", + "\tFOREIGN KEY(\"TrackId\") REFERENCES \"Track\" (\"TrackId\"), \n", + "\tFOREIGN KEY(\"PlaylistId\") REFERENCES \"Playlist\" (\"PlaylistId\")\n", + ")\n", + "\n", + "/*\n", + "3 rows from PlaylistTrack table:\n", + "PlaylistId\tTrackId\n", + "1\t3402\n", + "1\t3389\n", + "1\t3390\n", + "*/\u001b[0m\n", + "Thought:\u001b[32;1m\u001b[1;3m I now know the final answer\n", + "Final Answer: The PlaylistTrack table contains two columns, PlaylistId and TrackId, which are both integers and form a primary key. It also has two foreign keys, one to the Track table and one to the Playlist table.\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'The PlaylistTrack table contains two columns, PlaylistId and TrackId, which are both integers and form a primary key. 
It also has two foreign keys, one to the Track table and one to the Playlist table.'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent_executor.run(\"Describe the playlisttrack table\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extending the SQL Toolkit\n", + "\n", + "Although the out-of-the-box SQL Toolkit contains the necessary tools to start working on a database, it is often the case that some extra tools are useful for extending the agent's capabilities. This is particularly useful when trying to use **domain-specific knowledge** in the solution, in order to improve its overall performance.\n", + "\n", + "Some examples include:\n", + "\n", + "- Including dynamic few-shot examples\n", + "- Finding misspellings in proper nouns to use as column filters\n", + "\n", + "We can create separate tools which tackle these specific use cases and include them as a complement to the standard SQL Toolkit. Let's see how to include these two custom tools.\n", + "\n", + "#### Including dynamic few-shot examples\n", + "\n", + "In order to include dynamic few-shot examples, we need a custom **Retriever Tool** that handles the vector database in order to retrieve the examples that are semantically similar to the user’s question.\n", + "\n", + "Let's start by creating a dictionary with some examples: " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "few_shots = {'List all artists.': 'SELECT * FROM artists;',\n", + " \"Find all albums for the artist 'AC/DC'.\": \"SELECT * FROM albums WHERE ArtistId = (SELECT ArtistId FROM artists WHERE Name = 'AC/DC');\",\n", + " \"List all tracks in the 'Rock' genre.\": \"SELECT * FROM tracks WHERE GenreId = (SELECT GenreId FROM genres WHERE Name = 'Rock');\",\n", + " 'Find the total duration of all tracks.': 'SELECT SUM(Milliseconds) FROM tracks;',\n", + " 'List all customers from Canada.': \"SELECT * FROM customers WHERE Country = 'Canada';\",\n", + " 'How many tracks are there in the album with ID 5?': 'SELECT COUNT(*) FROM tracks WHERE AlbumId = 5;',\n", + " 'Find the total number of invoices.': 'SELECT COUNT(*) FROM invoices;',\n", + " 'List all tracks that are longer than 5 minutes.': 'SELECT * FROM tracks WHERE Milliseconds > 300000;',\n", + " 'Who are the top 5 customers by total purchase?': 'SELECT CustomerId, SUM(Total) AS TotalPurchase FROM invoices GROUP BY CustomerId ORDER BY TotalPurchase DESC LIMIT 5;',\n", + " 'Which albums are from the year 2000?': \"SELECT * FROM albums WHERE strftime('%Y', ReleaseDate) = '2000';\",\n", + " 'How many employees are there': 'SELECT COUNT(*) FROM \"employee\"'\n", + " }" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then create a retriever using the list of questions, assigning the target SQL query as metadata:" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import FAISS\n", + "from langchain.schema import Document\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "\n", + "few_shot_docs = [Document(page_content=question, metadata={'sql_query': few_shots[question]}) for question in few_shots.keys()]\n", + "vector_db = FAISS.from_documents(few_shot_docs, embeddings)\n", + "retriever = vector_db.as_retriever()" + ] + }, + { + "cell_type": "markdown",
+ "metadata": {}, + "source": [ + "Now we can create our own custom tool and append it as a new tool in the `create_sql_agent` function:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents.agent_toolkits import create_retriever_tool\n", + "\n", + "tool_description = \"\"\"\n", + "This tool will help you understand similar examples to adapt them to the user question.\n", + "Input to this tool should be the user question.\n", + "\"\"\"\n", + "\n", + "retriever_tool = create_retriever_tool(\n", + " retriever,\n", + " name='sql_get_similar_examples',\n", + " description=tool_description\n", + " )\n", + "custom_tool_list = [retriever_tool]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can create the agent, adjusting the standard SQL Agent suffix to consider our use case. Although the most straightforward way to handle this would be to include it just in the tool description, this is often not enough and we need to specify it in the agent prompt using the `suffix` argument in the constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import create_sql_agent, AgentType\n", + "from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n", + "from langchain.utilities import SQLDatabase\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n", + "llm = ChatOpenAI(model_name='gpt-4',temperature=0)\n", + "\n", + "toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n", + "\n", + "custom_suffix = \"\"\"\n", + "I should first get the similar examples I know.\n", + "If the examples are enough to construct the query, I can build it.\n", + "Otherwise, I can then look at the tables in the database to see what I can query.\n", + "Then I should query the schema of the most relevant tables\n", + "\"\"\"\n", + "\n", + "agent = create_sql_agent(llm=llm,\n", + " toolkit=toolkit,\n", + " verbose=True,\n", + " agent_type=AgentType.OPENAI_FUNCTIONS,\n", + " extra_tools=custom_tool_list,\n", + " suffix=custom_suffix\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try it out:" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `sql_get_similar_examples` with `How many employees do we have?`\n", + "\n", + "\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m[Document(page_content='How many employees are there', metadata={'sql_query': 'SELECT COUNT(*) FROM \"employee\"'}), Document(page_content='Find the total number of invoices.', metadata={'sql_query': 'SELECT COUNT(*) FROM invoices;'})]\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM employee`\n", + "responded: {content}\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mSELECT COUNT(*) FROM employee\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `sql_db_query` with `SELECT COUNT(*) FROM employee`\n", + "\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m[(8,)]\u001b[0m\u001b[32;1m\u001b[1;3mWe have 8 employees.\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'We have 8 employees.'" + ] + }, + "execution_count": 23, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "agent.run(\"How many employees do we have?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the agent first used the `sql_get_similar_examples` tool in order to retrieve similar examples. As the question was very similar to other few shot examples, the agent **didn't need to use any other tool** from the standard Toolkit, thus **saving time and tokens**." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Finding and correcting misspellings for proper nouns\n", + "\n", + "In order to filter columns that contain proper nouns such as addresses, song names or artists, we first need to double-check the spelling in order to filter the data correctly. \n", + "\n", + "We can achieve this by creating a vector store using all the distinct proper nouns that exist in the database. We can then have the agent query that vector store each time the user includes a proper noun in their question, to find the correct spelling for that word. In this way, the agent can make sure it understands which entity the user is referring to before building the target query.\n", + "\n", + "Let's follow a similar approach to the few shots, but without metadata: just embedding the proper nouns and then querying to get the most similar one to the misspelled user question.\n", + "\n", + "First we need the unique values for each entity we want, for which we define a function that parses the result into a list of elements:" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import re\n", + "\n", + "def run_query_save_results(db, query):\n", + " res = db.run(query)\n", + " res = [el for sub in ast.literal_eval(res) for el in sub if el]\n", + " res = [re.sub(r'\\b\\d+\\b', '', string).strip() for string in res]\n", + " return res\n", + "\n", + "artists = run_query_save_results(db, \"SELECT Name FROM Artist\")\n", + "albums = run_query_save_results(db, \"SELECT Title FROM Album\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can proceed with creating the custom **retreiver tool** and the final agent:" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents.agent_toolkits import create_retriever_tool\n", + "from langchain.embeddings.openai import OpenAIEmbeddings\n", + "from langchain.vectorstores import FAISS\n", + "\n", + "\n", + "texts = (artists + albums)\n", + "\n", + "embeddings = OpenAIEmbeddings()\n", + "vector_db = FAISS.from_texts(texts, embeddings)\n", + "retriever = vector_db.as_retriever()\n", + "\n", + "retriever_tool = create_retriever_tool(\n", + " retriever,\n", + " name='name_search',\n", + " description='use to learn how a piece of data is actually written, can be from names, surnames addresses etc'\n", + " )\n", + "\n", + "custom_tool_list = [retriever_tool]" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import create_sql_agent, AgentType\n", + "from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n", + "from langchain.utilities import SQLDatabase\n", + "from langchain.chat_models import ChatOpenAI\n", + "\n", + "# db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n", + "llm = ChatOpenAI(model_name='gpt-4', temperature=0)\n", + "\n", + "toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n", + 
"\n", + "custom_suffix = \"\"\"\n", + "If a user asks for me to filter based on proper nouns, I should first check the spelling using the name_search tool.\n", + "Otherwise, I can then look at the tables in the database to see what I can query.\n", + "Then I should query the schema of the most relevant tables\n", + "\"\"\"\n", + "\n", + "agent = create_sql_agent(llm=llm,\n", + " toolkit=toolkit,\n", + " verbose=True,\n", + " agent_type=AgentType.OPENAI_FUNCTIONS,\n", + " extra_tools=custom_tool_list,\n", + " suffix=custom_suffix\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Let's try it out:" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", + "\u001b[32;1m\u001b[1;3m\n", + "Invoking: `name_search` with `alis in pains`\n", + "\n", + "\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m[Document(page_content='House of Pain', metadata={}), Document(page_content='Alice In Chains', metadata={}), Document(page_content='Aisha Duo', metadata={}), Document(page_content='House Of Pain', metadata={})]\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `sql_db_list_tables` with ``\n", + "responded: {content}\n", + "\n", + "\u001b[0m\u001b[38;5;200m\u001b[1;3mAlbum, Artist, Customer, Employee, Genre, Invoice, InvoiceLine, MediaType, Playlist, PlaylistTrack, Track\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `sql_db_schema` with `Album, Artist`\n", + "responded: {content}\n", + "\n", + "\u001b[0m\u001b[33;1m\u001b[1;3m\n", + "CREATE TABLE \"Album\" (\n", + "\t\"AlbumId\" INTEGER NOT NULL, \n", + "\t\"Title\" NVARCHAR(160) NOT NULL, \n", + "\t\"ArtistId\" INTEGER NOT NULL, \n", + "\tPRIMARY KEY (\"AlbumId\"), \n", + "\tFOREIGN KEY(\"ArtistId\") REFERENCES \"Artist\" (\"ArtistId\")\n", + ")\n", + "\n", + "/*\n", + "3 rows from Album table:\n", + "AlbumId\tTitle\tArtistId\n", + "1\tFor Those About To Rock We Salute You\t1\n", + "2\tBalls to the Wall\t2\n", + "3\tRestless and Wild\t2\n", + "*/\n", + "\n", + "\n", + "CREATE TABLE \"Artist\" (\n", + "\t\"ArtistId\" INTEGER NOT NULL, \n", + "\t\"Name\" NVARCHAR(120), \n", + "\tPRIMARY KEY (\"ArtistId\")\n", + ")\n", + "\n", + "/*\n", + "3 rows from Artist table:\n", + "ArtistId\tName\n", + "1\tAC/DC\n", + "2\tAccept\n", + "3\tAerosmith\n", + "*/\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'`\n", + "responded: {content}\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3mSELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'\u001b[0m\u001b[32;1m\u001b[1;3m\n", + "Invoking: `sql_db_query` with `SELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'`\n", + "\n", + "\n", + "\u001b[0m\u001b[36;1m\u001b[1;3m[(1,)]\u001b[0m\u001b[32;1m\u001b[1;3mAlice In Chains has 1 album in the database.\u001b[0m\n", + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "'Alice In Chains has 1 album in the database.'" + ] + }, + "execution_count": 55, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "agent.run(\"How many albums does alis in pains have?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see, the 
agent used the `name_search` tool in order to check how to correctly query the database for this specific artist." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Go deeper\n", + "\n", + "To learn more about the SQL Agent and how it works, refer to the [SQL Agent Toolkit](/docs/integrations/toolkits/sql_database) documentation.\n", + "\n", + "You can also check Agents for other document types:\n", + "- [Pandas Agent](/docs/integrations/toolkits/pandas.html)\n", + "- [CSV Agent](/docs/integrations/toolkits/csv.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Elasticsearch\n", + "\n", + "Going beyond the above use case, there are integrations with other databases.\n", + "\n", + "For example, we can interact with the Elasticsearch analytics database.\n", + "\n", + "This chain builds search queries via the Elasticsearch DSL API (filters and aggregations).\n", + "\n", + "The Elasticsearch client must have permissions for index listing, mapping description and search queries.\n", + "\n", + "See [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html) for instructions on how to run Elasticsearch locally.\n", + "\n", + "Make sure to install the Elasticsearch Python client first:\n", + "\n", + "```sh\n", + "pip install elasticsearch\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "from elasticsearch import Elasticsearch\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.chains.elasticsearch_database import ElasticsearchDatabaseChain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Initialize Elasticsearch python client.\n", + "# See https://elasticsearch-py.readthedocs.io/en/v8.8.2/api.html#elasticsearch.Elasticsearch\n", + "ELASTIC_SEARCH_SERVER = \"https://elastic:pass@localhost:9200\"\n", + "db = Elasticsearch(ELASTIC_SEARCH_SERVER)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Uncomment the next cell to initially populate your db." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# customers = [\n", + "# {\"firstname\": \"Jennifer\", \"lastname\": \"Walters\"},\n", + "# {\"firstname\": \"Monica\",\"lastname\":\"Rambeau\"},\n", + "# {\"firstname\": \"Carol\",\"lastname\":\"Danvers\"},\n", + "# {\"firstname\": \"Wanda\",\"lastname\":\"Maximoff\"},\n", + "# {\"firstname\": \"Jennifer\",\"lastname\":\"Takeda\"},\n", + "# ]\n", + "# for i, customer in enumerate(customers):\n", + "# db.create(index=\"customers\", document=customer, id=i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "llm = ChatOpenAI(model_name=\"gpt-4\", temperature=0)\n", + "chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, verbose=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "question = \"What are the first names of all the customers?\"\n", + "chain.run(question)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can customize the prompt."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains.elasticsearch_database.prompts import DEFAULT_DSL_TEMPLATE\n", + "from langchain.prompts.prompt import PromptTemplate\n", + "\n", + "PROMPT_TEMPLATE = \"\"\"Given an input question, create a syntactically correct Elasticsearch query to run. Unless the user specifies in their question a specific number of examples they wish to obtain, always limit your query to at most {top_k} results. You can order the results by a relevant column to return the most interesting examples in the database.\n", + "\n", + "Unless told otherwise, do not query for all the columns from a specific index; only ask for the few relevant columns given the question.\n", + "\n", + "Pay attention to use only the column names that you can see in the mapping description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which index. Return the query as valid json.\n", + "\n", + "Use the following format:\n", + "\n", + "Question: Question here\n", + "ESQuery: Elasticsearch Query formatted as json\n", + "\"\"\"\n", + "\n", + "PROMPT = PromptTemplate.from_template(\n", + " PROMPT_TEMPLATE,\n", + ")\n", + "chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, query_prompt=PROMPT)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/docs/extras/use_cases/sql.ipynb b/docs/extras/use_cases/sql/sql.ipynb similarity index 100% rename from docs/extras/use_cases/sql.ipynb rename to docs/extras/use_cases/sql/sql.ipynb diff --git a/docs/extras/integrations/tools/sqlite.mdx b/docs/extras/use_cases/sql/sqlite.mdx similarity index 100% rename from docs/extras/integrations/tools/sqlite.mdx rename to docs/extras/use_cases/sql/sqlite.mdx diff --git a/docs/extras/use_cases/summarization.ipynb b/docs/extras/use_cases/summarization.ipynb index 000ba48124..6d7e118ab7 100644 --- a/docs/extras/use_cases/summarization.ipynb +++ b/docs/extras/use_cases/summarization.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "2aca8168-62ec-4bba-93f0-73da08cd1920", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: Summarization\n", + "---" + ] + }, { "cell_type": "markdown", "id": "cf13f702", "metadata": {}, "source": [ - "# Summarization\n", - "\n", "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/summarization.ipynb)\n", "\n", "## Use case\n", @@ -548,7 +557,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/use_cases/tagging.ipynb b/docs/extras/use_cases/tagging.ipynb index 235f9d06cb..37242a84f5 100644 --- a/docs/extras/use_cases/tagging.ipynb +++ b/docs/extras/use_cases/tagging.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "cb6f552e-775f-4d84-bc7c-dca94c06a33c", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: Tagging\n", +
"---" + ] + }, { "cell_type": "markdown", "id": "a0507a4b", "metadata": {}, "source": [ - "# Tagging\n", - "\n", "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/tagging.ipynb)\n", "\n", "## Use case\n", @@ -408,7 +417,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/extras/use_cases/web_scraping.ipynb b/docs/extras/use_cases/web_scraping.ipynb index 57c9e8387a..41bb28703e 100644 --- a/docs/extras/use_cases/web_scraping.ipynb +++ b/docs/extras/use_cases/web_scraping.ipynb @@ -1,12 +1,21 @@ { "cells": [ + { + "cell_type": "raw", + "id": "e254cf03-49fc-4051-a4df-3a8e4e7d2688", + "metadata": {}, + "source": [ + "---\n", + "sidebar_position: 1\n", + "title: Web scraping\n", + "---" + ] + }, { "cell_type": "markdown", "id": "6605e7f7", "metadata": {}, "source": [ - "# Web Scraping\n", - "\n", "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/web_scraping.ipynb)\n", "\n", "## Use case\n", @@ -306,9 +315,7 @@ "cell_type": "code", "execution_count": 7, "id": "977560ba", - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [ { "name": "stdout", @@ -591,7 +598,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.16" + "version": "3.9.1" } }, "nbformat": 4, diff --git a/docs/integrations/vearch.md b/docs/integrations/vearch.md new file mode 100644 index 0000000000..da61bec98c --- /dev/null +++ b/docs/integrations/vearch.md @@ -0,0 +1,15 @@ +# Vearch + +Vearch is a scalable distributed system for efficient similarity search of deep learning vectors. + +# Installation and Setup + +Vearch Python SDK enables vearch to use locally. Vearch python sdk can be installed easily by pip install vearch. + +# Vectorstore + +Vearch also can used as vectorstore. 
+For more details, see [this notebook](docs/modules/indexes/vectorstores/examples/vearch.ipynb) + +```python +from langchain.vectorstores import Vearch +``` diff --git a/docs/snippets/modules/agents/agent_types/openai_functions_agent.mdx b/docs/snippets/modules/agents/agent_types/openai_functions_agent.mdx index a04e2ac8c7..aaf208dc05 100644 --- a/docs/snippets/modules/agents/agent_types/openai_functions_agent.mdx +++ b/docs/snippets/modules/agents/agent_types/openai_functions_agent.mdx @@ -5,10 +5,12 @@ pip install openai google-search-results ``` ```python -from langchain import LLMMathChain, OpenAI, SerpAPIWrapper, SQLDatabase, SQLDatabaseChain -from langchain.agents import initialize_agent, Tool -from langchain.agents import AgentType +from langchain.agents import initialize_agent, AgentType, Tool +from langchain.chains import LLMMathChain +from langchain.chat_models import ChatOpenAI +from langchain.llms import OpenAI +from langchain.utilities import SerpAPIWrapper, SQLDatabase +from langchain_experimental.sql import SQLDatabaseChain ``` diff --git a/libs/experimental/langchain_experimental/data_anonymizer/__init__.py b/libs/experimental/langchain_experimental/data_anonymizer/__init__.py index 69babad859..f43d6d98df 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/__init__.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/__init__.py @@ -1,4 +1,7 @@ """Data anonymizer package""" -from langchain_experimental.data_anonymizer.presidio import PresidioAnonymizer +from langchain_experimental.data_anonymizer.presidio import ( + PresidioAnonymizer, + PresidioReversibleAnonymizer, +) -__all__ = ["PresidioAnonymizer"] +__all__ = ["PresidioAnonymizer", "PresidioReversibleAnonymizer"] diff --git a/libs/experimental/langchain_experimental/data_anonymizer/base.py b/libs/experimental/langchain_experimental/data_anonymizer/base.py index 3f9905375e..292d2a2a0f 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/base.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/base.py @@ -1,4 +1,5 @@ from abc import ABC, abstractmethod +from typing import Optional class AnonymizerBase(ABC): @@ -8,10 +9,24 @@ class AnonymizerBase(ABC): wrapping the behavior for all methods in a base class. """ - def anonymize(self, text: str) -> str: + def anonymize(self, text: str, language: Optional[str] = None) -> str: """Anonymize text""" - return self._anonymize(text) + return self._anonymize(text, language) @abstractmethod - def _anonymize(self, text: str) -> str: + def _anonymize(self, text: str, language: Optional[str]) -> str: """Abstract method to anonymize text""" + + +class ReversibleAnonymizerBase(AnonymizerBase): + """ + Base abstract class for reversible anonymizers.
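+    Implementations are expected to keep the state they need (for example, a deanonymizer mapping) so that deanonymize() can restore the original values.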
+ """ + + def deanonymize(self, text: str) -> str: + """Deanonymize text""" + return self._deanonymize(text) + + @abstractmethod + def _deanonymize(self, text: str) -> str: + """Abstract method to deanonymize text""" diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py new file mode 100644 index 0000000000..2ee03eb208 --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_mapping.py @@ -0,0 +1,21 @@ +from collections import defaultdict +from dataclasses import dataclass, field +from typing import Dict + +MappingDataType = Dict[str, Dict[str, str]] + + +@dataclass +class DeanonymizerMapping: + mapping: MappingDataType = field( + default_factory=lambda: defaultdict(lambda: defaultdict(str)) + ) + + @property + def data(self) -> MappingDataType: + """Return the deanonymizer mapping""" + return {k: dict(v) for k, v in self.mapping.items()} + + def update(self, new_mapping: MappingDataType) -> None: + for entity_type, values in new_mapping.items(): + self.mapping[entity_type].update(values) diff --git a/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py new file mode 100644 index 0000000000..e5d9e8581b --- /dev/null +++ b/libs/experimental/langchain_experimental/data_anonymizer/deanonymizer_matching_strategies.py @@ -0,0 +1,17 @@ +from langchain_experimental.data_anonymizer.presidio import MappingDataType + + +def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str: + """ + Default matching strategy for deanonymization. + It replaces all the anonymized entities with the original ones. + + Args: + text: text to deanonymize + deanonymizer_mapping: mapping between anonymized entities and original ones""" + + # Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.) 
+ for entity_type in deanonymizer_mapping: + for anonymized, original in deanonymizer_mapping[entity_type].items(): + text = text.replace(anonymized, original) + return text diff --git a/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py index 8db4f94c2f..9015679f20 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/faker_presidio_mapping.py @@ -1,8 +1,8 @@ import string -from typing import Callable, Dict +from typing import Callable, Dict, Optional -def get_pseudoanonymizer_mapping() -> Dict[str, Callable]: +def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callable]: try: from faker import Faker except ImportError as e: @@ -11,6 +11,7 @@ def get_pseudoanonymizer_mapping() -> Dict[str, Callable]: ) from e fake = Faker() + fake.seed_instance(seed) # Listed entities supported by Microsoft Presidio (for now, global and US only) # Source: https://microsoft.github.io/presidio/supported_entities/ @@ -26,8 +27,8 @@ def get_pseudoanonymizer_mapping() -> Dict[str, Callable]: fake.random_choices(string.ascii_lowercase + string.digits, length=26) ), "IP_ADDRESS": lambda _: fake.ipv4_public(), - "LOCATION": lambda _: fake.address(), - "DATE_TIME": lambda _: fake.iso8601(), + "LOCATION": lambda _: fake.city(), + "DATE_TIME": lambda _: fake.date(), "NRP": lambda _: str(fake.random_number(digits=8, fix_len=True)), "MEDICAL_LICENSE": lambda _: fake.bothify(text="??######").upper(), "URL": lambda _: fake.url(), diff --git a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py index 298e3de1d5..b2be1dc5a1 100644 --- a/libs/experimental/langchain_experimental/data_anonymizer/presidio.py +++ b/libs/experimental/langchain_experimental/data_anonymizer/presidio.py @@ -1,24 +1,75 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Dict, List, Optional +import json +from collections import defaultdict +from pathlib import Path +from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union -from langchain_experimental.data_anonymizer.base import AnonymizerBase +import yaml + +from langchain_experimental.data_anonymizer.base import ( + AnonymizerBase, + ReversibleAnonymizerBase, +) +from langchain_experimental.data_anonymizer.deanonymizer_mapping import ( + DeanonymizerMapping, + MappingDataType, +) +from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import ( + default_matching_strategy, +) from langchain_experimental.data_anonymizer.faker_presidio_mapping import ( get_pseudoanonymizer_mapping, ) -if TYPE_CHECKING: - from presidio_analyzer import EntityRecognizer +try: + from presidio_analyzer import AnalyzerEngine + from presidio_analyzer.nlp_engine import NlpEngineProvider + +except ImportError as e: + raise ImportError( + "Could not import presidio_analyzer, please install with " + "`pip install presidio-analyzer`. You will also need to download a " + "spaCy model to use the analyzer, e.g. " + "`python -m spacy download en_core_web_lg`." + ) from e +try: + from presidio_anonymizer import AnonymizerEngine from presidio_anonymizer.entities import OperatorConfig +except ImportError as e: + raise ImportError( + "Could not import presidio_anonymizer, please install with " + "`pip install presidio-anonymizer`." 
+ ) from e +if TYPE_CHECKING: + from presidio_analyzer import EntityRecognizer, RecognizerResult + from presidio_anonymizer.entities import EngineResult -class PresidioAnonymizer(AnonymizerBase): - """Anonymizer using Microsoft Presidio.""" +# Configuring Anonymizer for multiple languages +# Detailed description and examples can be found here: +# langchain/docs/extras/guides/privacy/multi_language_anonymization.ipynb +DEFAULT_LANGUAGES_CONFIG = { + # You can also use Stanza or transformers library. + # See https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/ + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": "en", "model_name": "en_core_web_lg"}, + # {"lang_code": "de", "model_name": "de_core_news_md"}, + # {"lang_code": "es", "model_name": "es_core_news_md"}, + # ... + # List of available models: https://spacy.io/usage/models + ], +} + +class PresidioAnonymizerBase(AnonymizerBase): def __init__( self, analyzed_fields: Optional[List[str]] = None, operators: Optional[Dict[str, OperatorConfig]] = None, + languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, + faker_seed: Optional[int] = None, ): """ Args: @@ -28,25 +79,15 @@ class PresidioAnonymizer(AnonymizerBase): Operators allow for custom anonymization of detected PII. Learn more: https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/ + languages_config: Configuration for the NLP engine. + First language in the list will be used as the main language + in self.anonymize(...) when no language is specified. + Learn more: + https://microsoft.github.io/presidio/analyzer/customizing_nlp_models/ + faker_seed: Seed used to initialize faker. + Defaults to None, in which case faker will be seeded randomly + and provide random values. """ - try: - from presidio_analyzer import AnalyzerEngine - except ImportError as e: - raise ImportError( - "Could not import presidio_analyzer, please install with " - "`pip install presidio-analyzer`. You will also need to download a " - "spaCy model to use the analyzer, e.g. " - "`python -m spacy download en_core_web_lg`." - ) from e - try: - from presidio_anonymizer import AnonymizerEngine - from presidio_anonymizer.entities import OperatorConfig - except ImportError as e: - raise ImportError( - "Could not import presidio_anonymizer, please install with " - "`pip install presidio-anonymizer`." - ) from e - self.analyzed_fields = ( analyzed_fields if analyzed_fields is not None @@ -59,17 +100,66 @@ class PresidioAnonymizer(AnonymizerBase): field: OperatorConfig( operator_name="custom", params={"lambda": faker_function} ) - for field, faker_function in get_pseudoanonymizer_mapping().items() + for field, faker_function in get_pseudoanonymizer_mapping( + faker_seed + ).items() } ) - self._analyzer = AnalyzerEngine() + + provider = NlpEngineProvider(nlp_configuration=languages_config) + nlp_engine = provider.create_engine() + + self.supported_languages = list(nlp_engine.nlp.keys()) + + self._analyzer = AnalyzerEngine( + supported_languages=self.supported_languages, nlp_engine=nlp_engine + ) self._anonymizer = AnonymizerEngine() - def _anonymize(self, text: str) -> str: + def add_recognizer(self, recognizer: EntityRecognizer) -> None: + """Add a recognizer to the analyzer + + Args: + recognizer: Recognizer to add to the analyzer. 
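+                The recognizer's supported entities are appended to analyzed_fields, so the newly supported entity types are anonymized as well.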
+ """ + self._analyzer.registry.add_recognizer(recognizer) + self.analyzed_fields.extend(recognizer.supported_entities) + + def add_operators(self, operators: Dict[str, OperatorConfig]) -> None: + """Add operators to the anonymizer + + Args: + operators: Operators to add to the anonymizer. + """ + self.operators.update(operators) + + +class PresidioAnonymizer(PresidioAnonymizerBase): + def _anonymize(self, text: str, language: Optional[str] = None) -> str: + """Anonymize text. + Each PII entity is replaced with a fake value. + Each time fake values will be different, as they are generated randomly. + + Args: + text: text to anonymize + language: language to use for analysis of PII + If None, the first (main) language in the list + of languages specified in the configuration will be used. + """ + if language is None: + language = self.supported_languages[0] + + if language not in self.supported_languages: + raise ValueError( + f"Language '{language}' is not supported. " + f"Supported languages are: {self.supported_languages}. " + "Change your language configuration file to add more languages." + ) + results = self._analyzer.analyze( text, entities=self.analyzed_fields, - language="en", + language=language, ) return self._anonymizer.anonymize( @@ -78,11 +168,199 @@ class PresidioAnonymizer(AnonymizerBase): operators=self.operators, ).text - def add_recognizer(self, recognizer: EntityRecognizer) -> None: - """Add a recognizer to the analyzer""" - self._analyzer.registry.add_recognizer(recognizer) - self.analyzed_fields.extend(recognizer.supported_entities) - def add_operators(self, operators: Dict[str, OperatorConfig]) -> None: - """Add operators to the anonymizer""" - self.operators.update(operators) +class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase): + def __init__( + self, + analyzed_fields: Optional[List[str]] = None, + operators: Optional[Dict[str, OperatorConfig]] = None, + languages_config: Dict = DEFAULT_LANGUAGES_CONFIG, + faker_seed: Optional[int] = None, + ): + super().__init__(analyzed_fields, operators, languages_config, faker_seed) + self._deanonymizer_mapping = DeanonymizerMapping() + + @property + def deanonymizer_mapping(self) -> MappingDataType: + """Return the deanonymizer mapping""" + return self._deanonymizer_mapping.data + + def _update_deanonymizer_mapping( + self, + original_text: str, + analyzer_results: List[RecognizerResult], + anonymizer_results: EngineResult, + ) -> None: + """Creates or updates the mapping used to de-anonymize text. + + This method exploits the results returned by the + analysis and anonymization processes. + + It constructs a mapping from each anonymized entity + back to its original text value. + + Mapping will be stored as "deanonymizer_mapping" property. + + Example of "deanonymizer_mapping": + { + "PERSON": { + "": "", + "John Doe": "Slim Shady" + }, + "PHONE_NUMBER": { + "111-111-1111": "555-555-5555" + } + ... + } + """ + + # We are able to zip and loop through both lists because we expect + # them to return corresponding entities for each identified piece + # of analyzable data from our input. + + # We sort them by their 'start' attribute because it allows us to + # match corresponding entities by their position in the input text. 
+ analyzer_results = sorted(analyzer_results, key=lambda d: d.start) + anonymizer_results.items = sorted( + anonymizer_results.items, key=lambda d: d.start + ) + + new_deanonymizer_mapping: MappingDataType = defaultdict(dict) + + for analyzed_entity, anonymized_entity in zip( + analyzer_results, anonymizer_results.items + ): + original_value = original_text[analyzed_entity.start : analyzed_entity.end] + new_deanonymizer_mapping[anonymized_entity.entity_type][ + anonymized_entity.text + ] = original_value + + self._deanonymizer_mapping.update(new_deanonymizer_mapping) + + def _anonymize(self, text: str, language: Optional[str] = None) -> str: + """Anonymize text. + Each PII entity is replaced with a fake value. + Each time fake values will be different, as they are generated randomly. + At the same time, we will create a mapping from each anonymized entity + back to its original text value. + + Args: + text: text to anonymize + language: language to use for analysis of PII + If None, the first (main) language in the list + of languages specified in the configuration will be used. + """ + if language is None: + language = self.supported_languages[0] + + if language not in self.supported_languages: + raise ValueError( + f"Language '{language}' is not supported. " + f"Supported languages are: {self.supported_languages}. " + "Change your language configuration file to add more languages." + ) + + analyzer_results = self._analyzer.analyze( + text, + entities=self.analyzed_fields, + language=language, + ) + + filtered_analyzer_results = ( + self._anonymizer._remove_conflicts_and_get_text_manipulation_data( + analyzer_results + ) + ) + + anonymizer_results = self._anonymizer.anonymize( + text, + analyzer_results=analyzer_results, + operators=self.operators, + ) + + self._update_deanonymizer_mapping( + text, filtered_analyzer_results, anonymizer_results + ) + + return anonymizer_results.text + + def _deanonymize( + self, + text_to_deanonymize: str, + deanonymizer_matching_strategy: Callable[ + [str, MappingDataType], str + ] = default_matching_strategy, + ) -> str: + """Deanonymize text. + Each anonymized entity is replaced with its original value. + This method exploits the mapping created during the anonymization process. + + Args: + text_to_deanonymize: text to deanonymize + deanonymizer_matching_strategy: function to use to match + anonymized entities with their original values and replace them. + """ + if not self._deanonymizer_mapping: + raise ValueError( + "Deanonymizer mapping is empty.", + "Please call anonymize() and anonymize some text first.", + ) + + text_to_deanonymize = deanonymizer_matching_strategy( + text_to_deanonymize, self.deanonymizer_mapping + ) + + return text_to_deanonymize + + def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None: + """Save the deanonymizer mapping to a JSON or YAML file. + + Args: + file_path: Path to file to save the mapping to. + + Example: + .. 
code-block:: python + + anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json") + """ + + save_path = Path(file_path) + + if save_path.suffix not in [".json", ".yaml"]: + raise ValueError(f"{save_path} must have an extension of .json or .yaml") + + # Make sure parent directories exist + save_path.parent.mkdir(parents=True, exist_ok=True) + + if save_path.suffix == ".json": + with open(save_path, "w") as f: + json.dump(self.deanonymizer_mapping, f, indent=2) + elif save_path.suffix == ".yaml": + with open(save_path, "w") as f: + yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False) + + def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None: + """Load the deanonymizer mapping from a JSON or YAML file. + + Args: + file_path: Path to file to load the mapping from. + + Example: + .. code-block:: python + + anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json") + """ + + load_path = Path(file_path) + + if load_path.suffix not in [".json", ".yaml"]: + raise ValueError(f"{load_path} must have an extension of .json or .yaml") + + if load_path.suffix == ".json": + with open(load_path, "r") as f: + loaded_mapping = json.load(f) + elif load_path.suffix == ".yaml": + with open(load_path, "r") as f: + loaded_mapping = yaml.load(f, Loader=yaml.FullLoader) + + self._deanonymizer_mapping.update(loaded_mapping) diff --git a/libs/experimental/langchain_experimental/graph_transformers/__init__.py b/libs/experimental/langchain_experimental/graph_transformers/__init__.py new file mode 100644 index 0000000000..3f6c8a665e --- /dev/null +++ b/libs/experimental/langchain_experimental/graph_transformers/__init__.py @@ -0,0 +1,5 @@ +from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer + +__all__ = [ + "DiffbotGraphTransformer", +] diff --git a/libs/experimental/langchain_experimental/graph_transformers/diffbot.py b/libs/experimental/langchain_experimental/graph_transformers/diffbot.py new file mode 100644 index 0000000000..000c70de4b --- /dev/null +++ b/libs/experimental/langchain_experimental/graph_transformers/diffbot.py @@ -0,0 +1,316 @@ +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union + +import requests +from langchain.graphs.graph_document import GraphDocument, Node, Relationship +from langchain.schema import Document +from langchain.utils import get_from_env + + +def format_property_key(s: str) -> str: + words = s.split() + if not words: + return s + first_word = words[0].lower() + capitalized_words = [word.capitalize() for word in words[1:]] + return "".join([first_word] + capitalized_words) + + +class NodesList: + """ + Manages a list of nodes with associated properties. + + Attributes: + nodes (Dict[Tuple, Any]): Stores nodes as keys and their properties as values. + Each key is a tuple where the first element is the + node ID and the second is the node type. + """ + + def __init__(self) -> None: + self.nodes: Dict[Tuple[Union[str, int], str], Any] = dict() + + def add_node_property( + self, node: Tuple[Union[str, int], str], properties: Dict[str, Any] + ) -> None: + """ + Adds or updates node properties. + + If the node does not exist in the list, it's added along with its properties. + If the node already exists, its properties are updated with the new values. + + Args: + node (Tuple): A tuple containing the node ID and node type. + properties (Dict): A dictionary of properties to add or update for the node. 
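+            Calling this twice with the same (id, type) tuple merges the two property dictionaries instead of replacing the node.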
+ """ + if node not in self.nodes: + self.nodes[node] = properties + else: + self.nodes[node].update(properties) + + def return_node_list(self) -> List[Node]: + """ + Returns the nodes as a list of Node objects. + + Each Node object will have its ID, type, and properties populated. + + Returns: + List[Node]: A list of Node objects. + """ + nodes = [ + Node(id=key[0], type=key[1], properties=self.nodes[key]) + for key in self.nodes + ] + return nodes + + +# Properties that should be treated as node properties instead of relationships +FACT_TO_PROPERTY_TYPE = [ + "Date", + "Number", + "Job title", + "Cause of death", + "Organization type", + "Academic title", +] + + +schema_mapping = [ + ("HEADQUARTERS", "ORGANIZATION_LOCATIONS"), + ("RESIDENCE", "PERSON_LOCATION"), + ("ALL_PERSON_LOCATIONS", "PERSON_LOCATION"), + ("CHILD", "HAS_CHILD"), + ("PARENT", "HAS_PARENT"), + ("CUSTOMERS", "HAS_CUSTOMER"), + ("SKILLED_AT", "INTERESTED_IN"), +] + + +class SimplifiedSchema: + """ + Provides functionality for working with a simplified schema mapping. + + Attributes: + schema (Dict): A dictionary containing the mapping to simplified schema types. + """ + + def __init__(self) -> None: + """Initializes the schema dictionary based on the predefined list.""" + self.schema = dict() + for row in schema_mapping: + self.schema[row[0]] = row[1] + + def get_type(self, type: str) -> str: + """ + Retrieves the simplified schema type for a given original type. + + Args: + type (str): The original schema type to find the simplified type for. + + Returns: + str: The simplified schema type if it exists; + otherwise, returns the original type. + """ + try: + return self.schema[type] + except KeyError: + return type + + +class DiffbotGraphTransformer: + """Transforms documents into graph documents using Diffbot's NLP API. + + A graph document transformation system takes a sequence of Documents and returns a + sequence of Graph Documents. + + Example: + .. code-block:: python + + class DiffbotGraphTransformer(BaseGraphDocumentTransformer): + + def transform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[GraphDocument]: + results = [] + + for document in documents: + raw_results = self.nlp_request(document.page_content) + graph_document = self.process_response(raw_results, document) + results.append(graph_document) + return results + + async def atransform_documents( + self, documents: Sequence[Document], **kwargs: Any + ) -> Sequence[Document]: + raise NotImplementedError + """ + + def __init__( + self, + diffbot_api_key: Optional[str] = None, + fact_confidence_threshold: float = 0.7, + include_qualifiers: bool = True, + include_evidence: bool = True, + simplified_schema: bool = True, + ) -> None: + """ + Initialize the graph transformer with various options. + + Args: + diffbot_api_key (str): + The API key for Diffbot's NLP services. + + fact_confidence_threshold (float): + Minimum confidence level for facts to be included. + include_qualifiers (bool): + Whether to include qualifiers in the relationships. + include_evidence (bool): + Whether to include evidence for the relationships. + simplified_schema (bool): + Whether to use a simplified schema for relationships. 
+ """ + self.diffbot_api_key = diffbot_api_key or get_from_env( + "diffbot_api_key", "DIFFBOT_API_KEY" + ) + self.fact_threshold_confidence = fact_confidence_threshold + self.include_qualifiers = include_qualifiers + self.include_evidence = include_evidence + self.simplified_schema = None + if simplified_schema: + self.simplified_schema = SimplifiedSchema() + + def nlp_request(self, text: str) -> Dict[str, Any]: + """ + Make an API request to the Diffbot NLP endpoint. + + Args: + text (str): The text to be processed. + + Returns: + Dict[str, Any]: The JSON response from the API. + """ + + # Relationship extraction only works for English + payload = { + "content": text, + "lang": "en", + } + + FIELDS = "facts" + HOST = "nl.diffbot.com" + url = ( + f"https://{HOST}/v1/?fields={FIELDS}&" + f"token={self.diffbot_api_key}&language=en" + ) + result = requests.post(url, data=payload) + return result.json() + + def process_response( + self, payload: Dict[str, Any], document: Document + ) -> GraphDocument: + """ + Transform the Diffbot NLP response into a GraphDocument. + + Args: + payload (Dict[str, Any]): The JSON response from Diffbot's NLP API. + document (Document): The original document. + + Returns: + GraphDocument: The transformed document as a graph. + """ + + # Return empty result if there are no facts + if "facts" not in payload or not payload["facts"]: + return GraphDocument(nodes=[], relationships=[], source=document) + + # Nodes are a custom class because we need to deduplicate + nodes_list = NodesList() + # Relationships are a list because we don't deduplicate nor anything else + relationships = list() + for record in payload["facts"]: + # Skip if the fact is below the threshold confidence + if record["confidence"] < self.fact_threshold_confidence: + continue + + # TODO: It should probably be treated as a node property + if not record["value"]["allTypes"]: + continue + + # Define source node + source_id = ( + record["entity"]["allUris"][0] + if record["entity"]["allUris"] + else record["entity"]["name"] + ) + source_label = record["entity"]["allTypes"][0]["name"].capitalize() + source_name = record["entity"]["name"] + source_node = Node(id=source_id, type=source_label) + nodes_list.add_node_property( + (source_id, source_label), {"name": source_name} + ) + + # Define target node + target_id = ( + record["value"]["allUris"][0] + if record["value"]["allUris"] + else record["value"]["name"] + ) + target_label = record["value"]["allTypes"][0]["name"].capitalize() + target_name = record["value"]["name"] + # Some facts are better suited as node properties + if target_label in FACT_TO_PROPERTY_TYPE: + nodes_list.add_node_property( + (source_id, source_label), + {format_property_key(record["property"]["name"]): target_name}, + ) + else: # Define relationship + # Define target node object + target_node = Node(id=target_id, type=target_label) + nodes_list.add_node_property( + (target_id, target_label), {"name": target_name} + ) + # Define relationship type + rel_type = record["property"]["name"].replace(" ", "_").upper() + if self.simplified_schema: + rel_type = self.simplified_schema.get_type(rel_type) + + # Relationship qualifiers/properties + rel_properties = dict() + relationship_evidence = [el["passage"] for el in record["evidence"]][0] + if self.include_evidence: + rel_properties.update({"evidence": relationship_evidence}) + if self.include_qualifiers and record.get("qualifiers"): + for property in record["qualifiers"]: + prop_key = format_property_key(property["property"]["name"]) + 
rel_properties[prop_key] = property["value"]["name"] + + relationship = Relationship( + source=source_node, + target=target_node, + type=rel_type, + properties=rel_properties, + ) + relationships.append(relationship) + + return GraphDocument( + nodes=nodes_list.return_node_list(), + relationships=relationships, + source=document, + ) + + def convert_to_graph_documents( + self, documents: Sequence[Document] + ) -> List[GraphDocument]: + """Convert a sequence of documents into graph documents. + + Args: + documents (Sequence[Document]): The original documents. + + Returns: + List[GraphDocument]: The transformed documents as graphs. + """ + results = [] + for document in documents: + raw_results = self.nlp_request(document.page_content) + graph_document = self.process_response(raw_results, document) + results.append(graph_document) + return results diff --git a/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py b/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py new file mode 100644 index 0000000000..1ec088dbc5 --- /dev/null +++ b/libs/experimental/langchain_experimental/retrievers/vector_sql_database.py @@ -0,0 +1,38 @@ +"""Vector SQL Database Chain Retriever""" +from typing import Any, Dict, List + +from langchain.callbacks.manager import ( + AsyncCallbackManagerForRetrieverRun, + CallbackManagerForRetrieverRun, +) +from langchain.schema import BaseRetriever, Document + +from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain + + +class VectorSQLDatabaseChainRetriever(BaseRetriever): + """Retriever that uses a VectorSQLDatabaseChain to fetch documents""" + + sql_db_chain: VectorSQLDatabaseChain + """SQL Database Chain""" + page_content_key: str = "content" + """column name for page content of documents""" + + def _get_relevant_documents( + self, + query: str, + *, + run_manager: CallbackManagerForRetrieverRun, + **kwargs: Any, + ) -> List[Document]: + ret: List[Dict[str, Any]] = self.sql_db_chain( + query, callbacks=run_manager.get_child(), **kwargs + )["result"] + return [ + Document(page_content=r[self.page_content_key], metadata=r) for r in ret + ] + + async def _aget_relevant_documents( + self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun + ) -> List[Document]: + raise NotImplementedError diff --git a/libs/experimental/langchain_experimental/sql/prompt.py b/libs/experimental/langchain_experimental/sql/prompt.py new file mode 100644 index 0000000000..5f4c9b8a4f --- /dev/null +++ b/libs/experimental/langchain_experimental/sql/prompt.py @@ -0,0 +1,85 @@ +# flake8: noqa +from langchain.prompts.prompt import PromptTemplate + + +PROMPT_SUFFIX = """Only use the following tables: +{table_info} + +Question: {input}""" + +_VECTOR_SQL_DEFAULT_TEMPLATE = """You are a {dialect} expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer to the input question. +{dialect} queries have a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. +When the query is asking for the {top_k} closest rows, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows. + +*NOTICE*: `DISTANCE(column, array)` only accepts an array column as its first argument and a `NeuralArray(entity)` as its second argument.
You also need a user-defined function called `NeuralArray(entity)` to retrieve the entity's array. + +Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per {dialect}. You should only order according to the distance function. +Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers. +Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table. +Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema. + +Use the following format: + +Question: "Question here" +SQLQuery: "SQL Query to run" +SQLResult: "Result of the SQLQuery" +Answer: "Final answer here" +""" + +VECTOR_SQL_PROMPT = PromptTemplate( + input_variables=["input", "table_info", "dialect", "top_k"], + template=_VECTOR_SQL_DEFAULT_TEMPLATE + PROMPT_SUFFIX, +) + + +_myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question. +MyScale queries have a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance. +When the query is asking for the {top_k} closest rows, you have to use this distance function to calculate distance to entity's array on vector column and order by the distance to retrieve relevant rows. + +*NOTICE*: `DISTANCE(column, array)` only accepts an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user-defined function called `NeuralArray(entity)` to retrieve the entity's array. + +Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per MyScale. You should only order according to the distance function. +Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers. +Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table. +Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema. + +Use the following format: + +======== table info ======== +<some table infos> + +Question: "Question here" +SQLQuery: "SQL Query to run" + + +Here are some examples: + +======== table info ======== +CREATE TABLE "ChatPaper" ( + abstract String, + id String, + vector Array(Float32), +) ENGINE = ReplicatedReplacingMergeTree() + ORDER BY id + PRIMARY KEY id + +Question: What is Feature Pyramid Network?
+SQLQuery: SELECT ChatPaper.abstract, ChatPaper.id FROM ChatPaper ORDER BY DISTANCE(vector, NeuralArray(Feature Pyramid Network)) LIMIT {top_k} + + +Let's begin: +======== table info ======== +{table_info} + +Question: {input} +SQLQuery: """ + +MYSCALE_PROMPT = PromptTemplate( + input_variables=["input", "table_info", "top_k"], + template=_myscale_prompt + PROMPT_SUFFIX, +) + + +VECTOR_SQL_PROMPTS = { + "myscale": MYSCALE_PROMPT, +} diff --git a/libs/experimental/langchain_experimental/sql/vector_sql.py b/libs/experimental/langchain_experimental/sql/vector_sql.py new file mode 100644 index 0000000000..98f3c2dee0 --- /dev/null +++ b/libs/experimental/langchain_experimental/sql/vector_sql.py @@ -0,0 +1,237 @@ +"""Vector SQL Database Chain""" +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Union + +from langchain.callbacks.manager import CallbackManagerForChainRun +from langchain.chains.llm import LLMChain +from langchain.chains.sql_database.prompt import PROMPT, SQL_PROMPTS +from langchain.embeddings.base import Embeddings +from langchain.prompts.prompt import PromptTemplate +from langchain.schema import BaseOutputParser, BasePromptTemplate +from langchain.schema.language_model import BaseLanguageModel +from langchain.tools.sql_database.prompt import QUERY_CHECKER +from langchain.utilities.sql_database import SQLDatabase + +from langchain_experimental.sql.base import INTERMEDIATE_STEPS_KEY, SQLDatabaseChain + + +class VectorSQLOutputParser(BaseOutputParser[str]): + """Output Parser for Vector SQL + 1. finds `NeuralArray()` and replaces it with the embedding of the entity + 2. finds `DISTANCE()` and replaces it with the distance function name of the backend SQL + """ + + model: Embeddings + """Embedding model to extract embedding for entity""" + distance_func_name: str = "distance" + """Distance name for Vector SQL""" + + class Config: + arbitrary_types_allowed = True + + @property + def _type(self) -> str: + return "vector_sql_parser" + + @classmethod + def from_embeddings( + cls, model: Embeddings, distance_func_name: str = "distance", **kwargs: Any + ) -> BaseOutputParser: + return cls(model=model, distance_func_name=distance_func_name, **kwargs) + + def parse(self, text: str) -> str: + text = text.strip() + start = text.find("NeuralArray(") + _sql_str_compl = text + if start >= 0: + _matched = text[text.find("NeuralArray(") + len("NeuralArray(") :] + end = _matched.find(")") + start + len("NeuralArray(") + 1 + entity = _matched[: _matched.find(")")] + vecs = self.model.embed_query(entity) + vecs_str = "[" + ",".join(map(str, vecs)) + "]" + _sql_str_compl = text.replace("DISTANCE", self.distance_func_name).replace( + text[start:end], vecs_str + ) + if _sql_str_compl[-1] == ";": + _sql_str_compl = _sql_str_compl[:-1] + return _sql_str_compl + + +class VectorSQLRetrieveAllOutputParser(VectorSQLOutputParser): + """Based on VectorSQLOutputParser; it also modifies the SQL to retrieve all columns + """ + + @property + def _type(self) -> str: + return "vector_sql_retrieve_all_parser" + + def parse(self, text: str) -> str: + text = text.strip() + start = text.upper().find("SELECT") + if start >= 0: + end = text.upper().find("FROM") + text = text.replace(text[start + len("SELECT") + 1 : end - 1], "*") + return super().parse(text) + + +def _try_eval(x: Any) -> Any: + try: + return eval(x) + except Exception: + return x + + +def get_result_from_sqldb( + db: SQLDatabase, cmd: str +) -> Union[str, List[Dict[str, Any]], Dict[str, Any]]: + result = db._execute(cmd, 
fetch="all") # type: ignore + if isinstance(result, list): + return [{k: _try_eval(v) for k, v in dict(d._asdict()).items()} for d in result] + else: + return { + k: _try_eval(v) for k, v in dict(result._asdict()).items() # type: ignore + } + + +class VectorSQLDatabaseChain(SQLDatabaseChain): + """Chain for interacting with Vector SQL Database. + + Example: + .. code-block:: python + + from langchain_experimental.sql import SQLDatabaseChain + from langchain import OpenAI, SQLDatabase, OpenAIEmbeddings + db = SQLDatabase(...) + db_chain = VectorSQLDatabaseChain.from_llm(OpenAI(), db, OpenAIEmbeddings()) + + *Security note*: Make sure that the database connection uses credentials + that are narrowly-scoped to only include the permissions this chain needs. + Failure to do so may result in data corruption or loss, since this chain may + attempt commands like `DROP TABLE` or `INSERT` if appropriately prompted. + The best way to guard against such negative outcomes is to (as appropriate) + limit the permissions granted to the credentials used with this chain. + This issue shows an example negative outcome if these steps are not taken: + https://github.com/langchain-ai/langchain/issues/5923 + """ + + sql_cmd_parser: VectorSQLOutputParser + """Parser for Vector SQL""" + native_format: bool = False + """If return_direct, controls whether to return in python native format""" + + def _call( + self, + inputs: Dict[str, Any], + run_manager: Optional[CallbackManagerForChainRun] = None, + ) -> Dict[str, Any]: + _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager() + input_text = f"{inputs[self.input_key]}\nSQLQuery:" + _run_manager.on_text(input_text, verbose=self.verbose) + # If not present, then defaults to None which is all tables. + table_names_to_use = inputs.get("table_names_to_use") + table_info = self.database.get_table_info(table_names=table_names_to_use) + llm_inputs = { + "input": input_text, + "top_k": str(self.top_k), + "dialect": self.database.dialect, + "table_info": table_info, + "stop": ["\nSQLResult:"], + } + intermediate_steps: List = [] + try: + intermediate_steps.append(llm_inputs) # input: sql generation + llm_out = self.llm_chain.predict( + callbacks=_run_manager.get_child(), + **llm_inputs, + ) + sql_cmd = self.sql_cmd_parser.parse(llm_out) + if self.return_sql: + return {self.output_key: sql_cmd} + if not self.use_query_checker: + _run_manager.on_text(llm_out, color="green", verbose=self.verbose) + intermediate_steps.append( + llm_out + ) # output: sql generation (no checker) + intermediate_steps.append({"sql_cmd": llm_out}) # input: sql exec + result = get_result_from_sqldb(self.database, sql_cmd) + intermediate_steps.append(str(result)) # output: sql exec + else: + query_checker_prompt = self.query_checker_prompt or PromptTemplate( + template=QUERY_CHECKER, input_variables=["query", "dialect"] + ) + query_checker_chain = LLMChain( + llm=self.llm_chain.llm, + prompt=query_checker_prompt, + output_parser=self.llm_chain.output_parser, + ) + query_checker_inputs = { + "query": llm_out, + "dialect": self.database.dialect, + } + checked_llm_out = query_checker_chain.predict( + callbacks=_run_manager.get_child(), **query_checker_inputs + ) + checked_sql_command = self.sql_cmd_parser.parse(checked_llm_out) + intermediate_steps.append( + checked_llm_out + ) # output: sql generation (checker) + _run_manager.on_text( + checked_llm_out, color="green", verbose=self.verbose + ) + intermediate_steps.append( + {"sql_cmd": checked_llm_out} + ) # input: sql exec + result = 
get_result_from_sqldb(self.database, checked_sql_command) + intermediate_steps.append(str(result)) # output: sql exec + llm_out = checked_llm_out + sql_cmd = checked_sql_command + + _run_manager.on_text("\nSQLResult: ", verbose=self.verbose) + _run_manager.on_text(str(result), color="yellow", verbose=self.verbose) + # If return direct, we just set the final result equal to + # the result of the sql query result, otherwise try to get a human readable + # final answer + if self.return_direct: + final_result = result + else: + _run_manager.on_text("\nAnswer:", verbose=self.verbose) + input_text += f"{llm_out}\nSQLResult: {result}\nAnswer:" + llm_inputs["input"] = input_text + intermediate_steps.append(llm_inputs) # input: final answer + final_result = self.llm_chain.predict( + callbacks=_run_manager.get_child(), + **llm_inputs, + ).strip() + intermediate_steps.append(final_result) # output: final answer + _run_manager.on_text(final_result, color="green", verbose=self.verbose) + chain_result: Dict[str, Any] = {self.output_key: final_result} + if self.return_intermediate_steps: + chain_result[INTERMEDIATE_STEPS_KEY] = intermediate_steps + return chain_result + except Exception as exc: + # Append intermediate steps to exception, to aid in logging and later + # improvement of few shot prompt seeds + exc.intermediate_steps = intermediate_steps # type: ignore + raise exc + + @property + def _chain_type(self) -> str: + return "vector_sql_database_chain" + + @classmethod + def from_llm( + cls, + llm: BaseLanguageModel, + db: SQLDatabase, + prompt: Optional[BasePromptTemplate] = None, + sql_cmd_parser: Optional[VectorSQLOutputParser] = None, + **kwargs: Any, + ) -> VectorSQLDatabaseChain: + assert sql_cmd_parser, "`sql_cmd_parser` must be set in VectorSQLDatabaseChain." 
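
For context before the lockfile churn below, here is a minimal usage sketch of the chain added above. It is not part of the diff: the connection URI, credentials, and question are placeholders, and it assumes `OPENAI_API_KEY` is set in the environment and that `VectorSQLOutputParser.from_embeddings` (the parser constructor from this same module) is available.

    # Hypothetical usage sketch -- placeholders only, not part of this PR.
    from langchain import OpenAI, OpenAIEmbeddings, SQLDatabase

    from langchain_experimental.sql.vector_sql import (
        VectorSQLDatabaseChain,
        VectorSQLOutputParser,
    )

    # Per the security note above, connect with narrowly-scoped, ideally
    # read-only, credentials; this URI is a placeholder.
    db = SQLDatabase.from_uri("clickhouse+native://reader:***@localhost/docs")

    # The output parser post-processes the LLM's generated SQL (e.g. injecting
    # the query embedding) before the command is executed.
    parser = VectorSQLOutputParser.from_embeddings(OpenAIEmbeddings())

    chain = VectorSQLDatabaseChain.from_llm(
        OpenAI(temperature=0),
        db,
        sql_cmd_parser=parser,
        return_intermediate_steps=True,
    )
    out = chain("Which three documents are most similar to 'vector search'?")
    print(out[chain.output_key])

With `return_intermediate_steps=True`, the returned dict also carries the `intermediate_steps` list that `_call` builds above (prompt inputs, generated SQL, and the stringified rows), which is useful when debugging the generated queries.
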
diff --git a/libs/experimental/poetry.lock b/libs/experimental/poetry.lock
index fc8d7a61b7..9e8cf9f1af 100644
--- a/libs/experimental/poetry.lock
+++ b/libs/experimental/poetry.lock
@@ -124,24 +124,24 @@ frozenlist = ">=1.1.0" [[package]] name = "anyio" -version = "4.0.0" +version = "3.7.1" description = "High level compatibility layer for multiple asynchronous event loop implementations" optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "anyio-4.0.0-py3-none-any.whl", hash = "sha256:cfdb2b588b9fc25ede96d8db56ed50848b0b649dca3dd1df0b11f683bb9e0b5f"}, - {file = "anyio-4.0.0.tar.gz", hash = "sha256:f7ed51751b2c2add651e5747c891b47e26d2a21be5d32d9311dfe9692f3e5d7a"}, + {file = "anyio-3.7.1-py3-none-any.whl", hash = "sha256:91dee416e570e92c64041bd18b900d1d6fa78dff7048769ce5ac5ddad004fbb5"}, + {file = "anyio-3.7.1.tar.gz", hash = "sha256:44a3c9aba0f5defa43261a8b3efb97891f2bd7d804e0e1f56419befa1adfc780"}, ] [package.dependencies] -exceptiongroup = {version = ">=1.0.2", markers = "python_version < \"3.11\""} +exceptiongroup = {version = "*", markers = "python_version < \"3.11\""} idna = ">=2.8" sniffio = ">=1.1" [package.extras] -doc = ["Sphinx (>=7)", "packaging", "sphinx-autodoc-typehints (>=1.2.0)"] -test = ["anyio[trio]", "coverage[toml] (>=7)", "hypothesis (>=4.0)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] -trio = ["trio (>=0.22)"] +doc = ["Sphinx", "packaging", "sphinx-autodoc-typehints (>=1.2.0)", "sphinx-rtd-theme (>=1.2.2)", "sphinxcontrib-jquery"] +test = ["anyio[trio]", "coverage[toml] (>=4.5)", "hypothesis (>=4.0)", "mock (>=4)", "psutil (>=5.9)", "pytest (>=7.0)", "pytest-mock (>=3.6.1)", "trustme", "uvloop (>=0.17)"] +trio = ["trio (<0.22)"] [[package]] name = "appnope" @@ -227,17 +227,17 @@ python-dateutil = ">=2.7.0" [[package]] name = "asttokens" -version = "2.4.0" +version = "2.2.1" description = "Annotate AST trees with source code positions" optional = false python-versions = "*" files = [ - {file = "asttokens-2.4.0-py2.py3-none-any.whl", hash = "sha256:cf8fc9e61a86461aa9fb161a14a0841a03c405fa829ac6b202670b3495d2ce69"}, - {file = "asttokens-2.4.0.tar.gz", hash = "sha256:2e0171b991b2c959acc6c49318049236844a5da1d65ba2672c4880c1c894834e"}, + {file = "asttokens-2.2.1-py2.py3-none-any.whl", hash = "sha256:6b0ac9e93fb0335014d382b8fa9b3afa7df546984258005da0b9e7095b3deb1c"}, + {file = "asttokens-2.2.1.tar.gz", hash = "sha256:4622110b2a6f30b77e1473affaa97e711bc2f07d3f10848420ff1898edbe94f3"}, ] [package.dependencies] -six = ">=1.12.0" +six = "*" [package.extras] test = ["astroid", "pytest"] @@ -734,33 +734,29 @@ dev = ["flake8", "hypothesis", "ipython", "mypy (>=0.710)", "portray", "pytest ( [[package]] name = "debugpy" -version = "1.7.0" +version = "1.6.7.post1" description = "An implementation of the Debug Adapter Protocol for Python" optional = false python-versions = ">=3.7" files = [ - {file = "debugpy-1.7.0-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:17ad9a681aca1704c55b9a5edcb495fa8f599e4655c9872b7f9cf3dc25890d48"}, - {file = "debugpy-1.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1285920a3f9a75f5d1acf59ab1b9da9ae6eb9a05884cd7674f95170c9cafa4de"}, - {file = "debugpy-1.7.0-cp310-cp310-win32.whl", hash =
"sha256:a6f43a681c5025db1f1c0568069d1d1bad306a02e7c36144912b26d9c90e4724"}, - {file = "debugpy-1.7.0-cp310-cp310-win_amd64.whl", hash = "sha256:9e9571d831ad3c75b5fb6f3efcb71c471cf2a74ba84af6ac1c79ce00683bed4b"}, - {file = "debugpy-1.7.0-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:538765a41198aa88cc089295b39c7322dd598f9ef1d52eaae12145c63bf9430a"}, - {file = "debugpy-1.7.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c7e8cf91f8f3f9b5fad844dd88427b85d398bda1e2a0cd65d5a21312fcbc0c6f"}, - {file = "debugpy-1.7.0-cp311-cp311-win32.whl", hash = "sha256:18a69f8e142a716310dd0af6d7db08992aed99e2606108732efde101e7c65e2a"}, - {file = "debugpy-1.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:7515a5ba5ee9bfe956685909c5f28734c1cecd4ee813523363acfe3ca824883a"}, - {file = "debugpy-1.7.0-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:bc8da67ade39d9e75608cdb8601d07e63a4e85966e0572c981f14e2cf42bcdef"}, - {file = "debugpy-1.7.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a5036e918c6ba8fc4c4f1fd0207d81db634431a02f0dc2ba51b12fd793c8c9de"}, - {file = "debugpy-1.7.0-cp37-cp37m-win32.whl", hash = "sha256:d5be95b3946a4d7b388e45068c7b75036ac5a610f41014aee6cafcd5506423ad"}, - {file = "debugpy-1.7.0-cp37-cp37m-win_amd64.whl", hash = "sha256:0e90314a078d4e3f009520c8387aba8f74c3034645daa7a332a3d1bb81335756"}, - {file = "debugpy-1.7.0-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:1565fd904f9571c430adca597771255cff4f92171486fced6f765dcbdfc8ec8d"}, - {file = "debugpy-1.7.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6516f36a2e95b3be27f171f12b641e443863f4ad5255d0fdcea6ae0be29bb912"}, - {file = "debugpy-1.7.0-cp38-cp38-win32.whl", hash = "sha256:2b0e489613bc066051439df04c56777ec184b957d6810cb65f235083aef7a0dc"}, - {file = "debugpy-1.7.0-cp38-cp38-win_amd64.whl", hash = "sha256:7bf0b4bbd841b2397b6a8de15da9227f1164f6d43ceee971c50194eaed930a9d"}, - {file = "debugpy-1.7.0-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:ad22e1095b9977af432465c1e09132ba176e18df3834b1efcab1a449346b350b"}, - {file = "debugpy-1.7.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f625e427f21423e5874139db529e18cb2966bdfcc1cb87a195538c5b34d163d1"}, - {file = "debugpy-1.7.0-cp39-cp39-win32.whl", hash = "sha256:18bca8429d6632e2d3435055416d2d88f0309cc39709f4f6355c8d412cc61f24"}, - {file = "debugpy-1.7.0-cp39-cp39-win_amd64.whl", hash = "sha256:dc8a12ac8b97ef3d6973c6679a093138c7c9b03eb685f0e253269a195f651559"}, - {file = "debugpy-1.7.0-py2.py3-none-any.whl", hash = "sha256:f6de2e6f24f62969e0f0ef682d78c98161c4dca29e9fb05df4d2989005005502"}, - {file = "debugpy-1.7.0.zip", hash = "sha256:676911c710e85567b17172db934a71319ed9d995104610ce23fd74a07f66e6f6"}, + {file = "debugpy-1.6.7.post1-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:903bd61d5eb433b6c25b48eae5e23821d4c1a19e25c9610205f5aeaccae64e32"}, + {file = "debugpy-1.6.7.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d16882030860081e7dd5aa619f30dec3c2f9a421e69861125f83cc372c94e57d"}, + {file = "debugpy-1.6.7.post1-cp310-cp310-win32.whl", hash = "sha256:eea8d8cfb9965ac41b99a61f8e755a8f50e9a20330938ad8271530210f54e09c"}, + {file = "debugpy-1.6.7.post1-cp310-cp310-win_amd64.whl", hash = "sha256:85969d864c45f70c3996067cfa76a319bae749b04171f2cdeceebe4add316155"}, + {file = "debugpy-1.6.7.post1-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:890f7ab9a683886a0f185786ffbda3b46495c4b929dab083b8c79d6825832a52"}, + {file = 
"debugpy-1.6.7.post1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d4ac7a4dba28801d184b7fc0e024da2635ca87d8b0a825c6087bb5168e3c0d28"}, + {file = "debugpy-1.6.7.post1-cp37-cp37m-win32.whl", hash = "sha256:3370ef1b9951d15799ef7af41f8174194f3482ee689988379763ef61a5456426"}, + {file = "debugpy-1.6.7.post1-cp37-cp37m-win_amd64.whl", hash = "sha256:65b28435a17cba4c09e739621173ff90c515f7b9e8ea469b92e3c28ef8e5cdfb"}, + {file = "debugpy-1.6.7.post1-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:92b6dae8bfbd497c90596bbb69089acf7954164aea3228a99d7e43e5267f5b36"}, + {file = "debugpy-1.6.7.post1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:72f5d2ecead8125cf669e62784ef1e6300f4067b0f14d9f95ee00ae06fc7c4f7"}, + {file = "debugpy-1.6.7.post1-cp38-cp38-win32.whl", hash = "sha256:f0851403030f3975d6e2eaa4abf73232ab90b98f041e3c09ba33be2beda43fcf"}, + {file = "debugpy-1.6.7.post1-cp38-cp38-win_amd64.whl", hash = "sha256:3de5d0f97c425dc49bce4293df6a04494309eedadd2b52c22e58d95107e178d9"}, + {file = "debugpy-1.6.7.post1-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:38651c3639a4e8bbf0ca7e52d799f6abd07d622a193c406be375da4d510d968d"}, + {file = "debugpy-1.6.7.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:038c51268367c9c935905a90b1c2d2dbfe304037c27ba9d19fe7409f8cdc710c"}, + {file = "debugpy-1.6.7.post1-cp39-cp39-win32.whl", hash = "sha256:4b9eba71c290852f959d2cf8a03af28afd3ca639ad374d393d53d367f7f685b2"}, + {file = "debugpy-1.6.7.post1-cp39-cp39-win_amd64.whl", hash = "sha256:973a97ed3b434eab0f792719a484566c35328196540676685c975651266fccf9"}, + {file = "debugpy-1.6.7.post1-py2.py3-none-any.whl", hash = "sha256:1093a5c541af079c13ac8c70ab8b24d1d35c8cacb676306cf11e57f699c02926"}, + {file = "debugpy-1.6.7.post1.zip", hash = "sha256:fe87ec0182ef624855d05e6ed7e0b7cb1359d2ffa2a925f8ec2d22e98b75d0ca"}, ] [[package]] @@ -941,41 +937,6 @@ files = [ {file = "frozenlist-1.4.0.tar.gz", hash = "sha256:09163bdf0b2907454042edb19f887c6d33806adc71fbd54afc14908bfdc22251"}, ] -[[package]] -name = "fsspec" -version = "2023.9.0" -description = "File-system specification" -optional = true -python-versions = ">=3.8" -files = [ - {file = "fsspec-2023.9.0-py3-none-any.whl", hash = "sha256:d55b9ab2a4c1f2b759888ae9f93e40c2aa72c0808132e87e282b549f9e6c4254"}, - {file = "fsspec-2023.9.0.tar.gz", hash = "sha256:4dbf0fefee035b7c6d3bbbe6bc99b2f201f40d4dca95b67c2b719be77bcd917f"}, -] - -[package.extras] -abfs = ["adlfs"] -adl = ["adlfs"] -arrow = ["pyarrow (>=1)"] -dask = ["dask", "distributed"] -devel = ["pytest", "pytest-cov"] -dropbox = ["dropbox", "dropboxdrivefs", "requests"] -full = ["adlfs", "aiohttp (!=4.0.0a0,!=4.0.0a1)", "dask", "distributed", "dropbox", "dropboxdrivefs", "fusepy", "gcsfs", "libarchive-c", "ocifs", "panel", "paramiko", "pyarrow (>=1)", "pygit2", "requests", "s3fs", "smbprotocol", "tqdm"] -fuse = ["fusepy"] -gcs = ["gcsfs"] -git = ["pygit2"] -github = ["requests"] -gs = ["gcsfs"] -gui = ["panel"] -hdfs = ["pyarrow (>=1)"] -http = ["aiohttp (!=4.0.0a0,!=4.0.0a1)", "requests"] -libarchive = ["libarchive-c"] -oci = ["ocifs"] -s3 = ["s3fs"] -sftp = ["paramiko"] -smb = ["smbprotocol"] -ssh = ["paramiko"] -tqdm = ["tqdm"] - [[package]] name = "greenlet" version = "2.0.2" @@ -1049,39 +1010,6 @@ files = [ docs = ["Sphinx", "docutils (<0.18)"] test = ["objgraph", "psutil"] -[[package]] -name = "huggingface-hub" -version = "0.17.0" -description = "Client library to download and publish models, datasets and other repos on the huggingface.co 
hub" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "huggingface_hub-0.17.0-py3-none-any.whl", hash = "sha256:8111ef89ebf5514154b4e929662f57fc43818d06c95dabdfa4c77f9087383172"}, - {file = "huggingface_hub-0.17.0.tar.gz", hash = "sha256:a048c64e0f651c32afe41a1818bf2cd47de902ff65dfba395ff71b999d9d4655"}, -] - -[package.dependencies] -filelock = "*" -fsspec = "*" -packaging = ">=20.9" -pyyaml = ">=5.1" -requests = "*" -tqdm = ">=4.42.1" -typing-extensions = ">=3.7.4.3" - -[package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (==23.7)", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] -cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (==23.7)", "gradio", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)"] -docs = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "black (==23.7)", "gradio", "hf-doc-builder", "jedi", "mypy (==1.5.1)", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "ruff (>=0.0.241)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "urllib3 (<2.0)", "watchdog"] -fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] -inference = ["aiohttp", "pydantic (<2.0)"] -quality = ["black (==23.7)", "mypy (==1.5.1)", "ruff (>=0.0.241)"] -tensorflow = ["graphviz", "pydot", "tensorflow"] -testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "gradio", "jedi", "numpy", "pydantic (<2.0)", "pytest", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] -torch = ["torch"] -typing = ["pydantic (<2.0)", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3"] - [[package]] name = "idna" version = "3.4" @@ -1143,13 +1071,13 @@ files = [ [[package]] name = "ipykernel" -version = "6.25.2" +version = "6.25.1" description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" files = [ - {file = "ipykernel-6.25.2-py3-none-any.whl", hash = "sha256:2e2ee359baba19f10251b99415bb39de1e97d04e1fab385646f24f0596510b77"}, - {file = "ipykernel-6.25.2.tar.gz", hash = "sha256:f468ddd1f17acb48c8ce67fcfa49ba6d46d4f9ac0438c1f441be7c3d1372230b"}, + {file = "ipykernel-6.25.1-py3-none-any.whl", hash = "sha256:c8a2430b357073b37c76c21c52184db42f6b4b0e438e1eb7df3c4440d120497c"}, + {file = "ipykernel-6.25.1.tar.gz", hash = "sha256:050391364c0977e768e354bdb60cbbfbee7cbb943b1af1618382021136ffd42f"}, ] [package.dependencies] @@ -1295,17 +1223,6 @@ MarkupSafe = ">=2.0" [package.extras] i18n = ["Babel (>=2.7)"] -[[package]] -name = "joblib" -version = "1.3.2" -description = "Lightweight pipelining with Python functions" -optional = true -python-versions = ">=3.7" -files = [ - {file = "joblib-1.3.2-py3-none-any.whl", hash = "sha256:ef4331c65f239985f3f2220ecc87db222f08fd22097a3dd5698f693875f8cbb9"}, - {file = "joblib-1.3.2.tar.gz", hash = 
"sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, -] - [[package]] name = "json5" version = "0.9.14" @@ -1399,13 +1316,13 @@ qtconsole = "*" [[package]] name = "jupyter-client" -version = "8.3.1" +version = "8.3.0" description = "Jupyter protocol implementation and client libraries" optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_client-8.3.1-py3-none-any.whl", hash = "sha256:5eb9f55eb0650e81de6b7e34308d8b92d04fe4ec41cd8193a913979e33d8e1a5"}, - {file = "jupyter_client-8.3.1.tar.gz", hash = "sha256:60294b2d5b869356c893f57b1a877ea6510d60d45cf4b38057f1672d85699ac9"}, + {file = "jupyter_client-8.3.0-py3-none-any.whl", hash = "sha256:7441af0c0672edc5d28035e92ba5e32fadcfa8a4e608a434c228836a89df6158"}, + {file = "jupyter_client-8.3.0.tar.gz", hash = "sha256:3af69921fe99617be1670399a0b857ad67275eefcfa291e2c81a160b7b650f5f"}, ] [package.dependencies] @@ -1506,13 +1423,13 @@ jupyter-server = ">=1.1.2" [[package]] name = "jupyter-server" -version = "2.7.3" +version = "2.7.2" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_server-2.7.3-py3-none-any.whl", hash = "sha256:8e4b90380b59d7a1e31086c4692231f2a2ea4cb269f5516e60aba72ce8317fc9"}, - {file = "jupyter_server-2.7.3.tar.gz", hash = "sha256:d4916c8581c4ebbc534cebdaa8eca2478d9f3bfdd88eae29fcab0120eac57649"}, + {file = "jupyter_server-2.7.2-py3-none-any.whl", hash = "sha256:98a375347b580e837e7016007c24680a4261ed8ad7cd35196ac087d229f48e5a"}, + {file = "jupyter_server-2.7.2.tar.gz", hash = "sha256:d64fb4e593907290e5df916e3c9399c15ab2cd7bdb71cbcd1d36452dbfb30523"}, ] [package.dependencies] @@ -1826,23 +1743,6 @@ files = [ {file = "mistune-3.0.1.tar.gz", hash = "sha256:e912116c13aa0944f9dc530db38eb88f6a77087ab128f49f84a48f4c05ea163c"}, ] -[[package]] -name = "mpmath" -version = "1.3.0" -description = "Python library for arbitrary-precision floating-point arithmetic" -optional = true -python-versions = "*" -files = [ - {file = "mpmath-1.3.0-py3-none-any.whl", hash = "sha256:a0b2b9fe80bbcd81a6647ff13108738cfb482d481d826cc0e02f5b35e5c88d2c"}, - {file = "mpmath-1.3.0.tar.gz", hash = "sha256:7a28eb2a9774d00c7bc92411c19a89209d5da7c4c9a9e227be8330a23a25b91f"}, -] - -[package.extras] -develop = ["codecov", "pycodestyle", "pytest (>=4.6)", "pytest-cov", "wheel"] -docs = ["sphinx"] -gmpy = ["gmpy2 (>=2.1.0a4)"] -tests = ["pytest (>=4.6)"] - [[package]] name = "multidict" version = "6.0.4" @@ -2048,13 +1948,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>= [[package]] name = "nbconvert" -version = "7.8.0" +version = "7.7.4" description = "Converting Jupyter Notebooks" optional = false python-versions = ">=3.8" files = [ - {file = "nbconvert-7.8.0-py3-none-any.whl", hash = "sha256:aec605e051fa682ccc7934ccc338ba1e8b626cfadbab0db592106b630f63f0f2"}, - {file = "nbconvert-7.8.0.tar.gz", hash = "sha256:f5bc15a1247e14dd41ceef0c0a3bc70020e016576eb0578da62f1c5b4f950479"}, + {file = "nbconvert-7.7.4-py3-none-any.whl", hash = "sha256:ace26f4386d08eb5c55833596a942048c5502a95e05590cb523826a749a40a37"}, + {file = "nbconvert-7.7.4.tar.gz", hash = "sha256:1113d039fa3fc3a846ffa5a3b0a019e85aaa94c566a09fa0c400fb7638e46087"}, ] [package.dependencies] @@ -2116,61 +2016,19 @@ files = [ {file = "nest_asyncio-1.5.7.tar.gz", hash = "sha256:6a80f7b98f24d9083ed24608977c09dd608d83f91cccc24c9d2cba6d10e01c10"}, ] -[[package]] -name = "networkx" -version = "3.1" -description = "Python 
package for creating and manipulating graphs and networks" -optional = true -python-versions = ">=3.8" -files = [ - {file = "networkx-3.1-py3-none-any.whl", hash = "sha256:4f33f68cb2afcf86f28a45f43efc27a9386b535d567d2127f8f61d51dec58d36"}, - {file = "networkx-3.1.tar.gz", hash = "sha256:de346335408f84de0eada6ff9fafafff9bcda11f0a0dfaa931133debb146ab61"}, -] - -[package.extras] -default = ["matplotlib (>=3.4)", "numpy (>=1.20)", "pandas (>=1.3)", "scipy (>=1.8)"] -developer = ["mypy (>=1.1)", "pre-commit (>=3.2)"] -doc = ["nb2plots (>=0.6)", "numpydoc (>=1.5)", "pillow (>=9.4)", "pydata-sphinx-theme (>=0.13)", "sphinx (>=6.1)", "sphinx-gallery (>=0.12)", "texext (>=0.6.7)"] -extra = ["lxml (>=4.6)", "pydot (>=1.4.2)", "pygraphviz (>=1.10)", "sympy (>=1.10)"] -test = ["codecov (>=2.1)", "pytest (>=7.2)", "pytest-cov (>=4.0)"] - -[[package]] -name = "nltk" -version = "3.8.1" -description = "Natural Language Toolkit" -optional = true -python-versions = ">=3.7" -files = [ - {file = "nltk-3.8.1-py3-none-any.whl", hash = "sha256:fd5c9109f976fa86bcadba8f91e47f5e9293bd034474752e92a520f81c93dda5"}, - {file = "nltk-3.8.1.zip", hash = "sha256:1834da3d0682cba4f2cede2f9aad6b0fafb6461ba451db0efb6f9c39798d64d3"}, -] - -[package.dependencies] -click = "*" -joblib = "*" -regex = ">=2021.8.3" -tqdm = "*" - -[package.extras] -all = ["matplotlib", "numpy", "pyparsing", "python-crfsuite", "requests", "scikit-learn", "scipy", "twython"] -corenlp = ["requests"] -machine-learning = ["numpy", "python-crfsuite", "scikit-learn", "scipy"] -plot = ["matplotlib"] -tgrep = ["pyparsing"] -twitter = ["twython"] - [[package]] name = "notebook" -version = "7.0.3" +version = "7.0.2" description = "Jupyter Notebook - A web-based notebook environment for interactive computing" optional = false python-versions = ">=3.8" files = [ - {file = "notebook-7.0.3-py3-none-any.whl", hash = "sha256:786ab2e3287c068667adce3029b540dd18fc5d23f49181b4b4ee4f6b48a7ca81"}, - {file = "notebook-7.0.3.tar.gz", hash = "sha256:07f3c5062fd0e6e69864437a0347abc485d991aae87a92c47d659699f571b729"}, + {file = "notebook-7.0.2-py3-none-any.whl", hash = "sha256:c77b1499dc9b07ce4f4f26990dcb25b2107b434f2536766b51a72a4228d9a4b6"}, + {file = "notebook-7.0.2.tar.gz", hash = "sha256:d70d6a07418c829bd5f54337ce993b7105261d9026f9d3fe68e9b8aa1a20da9a"}, ] [package.dependencies] +importlib-resources = {version = ">=5.0", markers = "python_version < \"3.9\""} jupyter-server = ">=2.4.0,<3" jupyterlab = ">=4.0.2,<5" jupyterlab-server = ">=2.22.1,<3" @@ -2180,7 +2038,7 @@ tornado = ">=6.2.0" [package.extras] dev = ["hatch", "pre-commit"] docs = ["myst-parser", "nbsphinx", "pydata-sphinx-theme", "sphinx (>=1.3.6)", "sphinxcontrib-github-alt", "sphinxcontrib-spelling"] -test = ["importlib-resources (>=5.0)", "ipykernel", "jupyter-server[test] (>=2.4.0,<3)", "jupyterlab-server[test] (>=2.22.1,<3)", "nbval", "pytest (>=7.0)", "pytest-console-scripts", "pytest-timeout", "pytest-tornasync", "requests"] +test = ["ipykernel", "jupyter-server[test] (>=2.4.0,<3)", "jupyterlab-server[test] (>=2.22.1,<3)", "nbval", "pytest (>=7.0)", "pytest-console-scripts", "pytest-timeout", "pytest-tornasync", "requests"] [[package]] name = "notebook-shim" @@ -2314,73 +2172,6 @@ files = [ {file = "packaging-23.1.tar.gz", hash = "sha256:a392980d2b6cffa644431898be54b0045151319d1e7ec34f0cfed48767dd334f"}, ] -[[package]] -name = "pandas" -version = "2.0.3" -description = "Powerful data structures for data analysis, time series, and statistics" -optional = true -python-versions = ">=3.8" -files = [ - {file 
= "pandas-2.0.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4c7c9f27a4185304c7caf96dc7d91bc60bc162221152de697c98eb0b2648dd8"}, - {file = "pandas-2.0.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:f167beed68918d62bffb6ec64f2e1d8a7d297a038f86d4aed056b9493fca407f"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce0c6f76a0f1ba361551f3e6dceaff06bde7514a374aa43e33b588ec10420183"}, - {file = "pandas-2.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba619e410a21d8c387a1ea6e8a0e49bb42216474436245718d7f2e88a2f8d7c0"}, - {file = "pandas-2.0.3-cp310-cp310-win32.whl", hash = "sha256:3ef285093b4fe5058eefd756100a367f27029913760773c8bf1d2d8bebe5d210"}, - {file = "pandas-2.0.3-cp310-cp310-win_amd64.whl", hash = "sha256:9ee1a69328d5c36c98d8e74db06f4ad518a1840e8ccb94a4ba86920986bb617e"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b084b91d8d66ab19f5bb3256cbd5ea661848338301940e17f4492b2ce0801fe8"}, - {file = "pandas-2.0.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:37673e3bdf1551b95bf5d4ce372b37770f9529743d2498032439371fc7b7eb26"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9cb1e14fdb546396b7e1b923ffaeeac24e4cedd14266c3497216dd4448e4f2d"}, - {file = "pandas-2.0.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d9cd88488cceb7635aebb84809d087468eb33551097d600c6dad13602029c2df"}, - {file = "pandas-2.0.3-cp311-cp311-win32.whl", hash = "sha256:694888a81198786f0e164ee3a581df7d505024fbb1f15202fc7db88a71d84ebd"}, - {file = "pandas-2.0.3-cp311-cp311-win_amd64.whl", hash = "sha256:6a21ab5c89dcbd57f78d0ae16630b090eec626360085a4148693def5452d8a6b"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:9e4da0d45e7f34c069fe4d522359df7d23badf83abc1d1cef398895822d11061"}, - {file = "pandas-2.0.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:32fca2ee1b0d93dd71d979726b12b61faa06aeb93cf77468776287f41ff8fdc5"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:258d3624b3ae734490e4d63c430256e716f488c4fcb7c8e9bde2d3aa46c29089"}, - {file = "pandas-2.0.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eae3dc34fa1aa7772dd3fc60270d13ced7346fcbcfee017d3132ec625e23bb0"}, - {file = "pandas-2.0.3-cp38-cp38-win32.whl", hash = "sha256:f3421a7afb1a43f7e38e82e844e2bca9a6d793d66c1a7f9f0ff39a795bbc5e02"}, - {file = "pandas-2.0.3-cp38-cp38-win_amd64.whl", hash = "sha256:69d7f3884c95da3a31ef82b7618af5710dba95bb885ffab339aad925c3e8ce78"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5247fb1ba347c1261cbbf0fcfba4a3121fbb4029d95d9ef4dc45406620b25c8b"}, - {file = "pandas-2.0.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:81af086f4543c9d8bb128328b5d32e9986e0c84d3ee673a2ac6fb57fd14f755e"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1994c789bf12a7c5098277fb43836ce090f1073858c10f9220998ac74f37c69b"}, - {file = "pandas-2.0.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5ec591c48e29226bcbb316e0c1e9423622bc7a4eaf1ef7c3c9fa1a3981f89641"}, - {file = "pandas-2.0.3-cp39-cp39-win32.whl", hash = "sha256:04dbdbaf2e4d46ca8da896e1805bc04eb85caa9a82e259e8eed00254d5e0c682"}, - {file = "pandas-2.0.3-cp39-cp39-win_amd64.whl", hash = "sha256:1168574b036cd8b93abc746171c9b4f1b83467438a5e45909fed645cf8692dbc"}, - {file = 
"pandas-2.0.3.tar.gz", hash = "sha256:c02f372a88e0d17f36d3093a644c73cfc1788e876a7c4bcb4020a77512e2043c"}, -] - -[package.dependencies] -numpy = [ - {version = ">=1.20.3", markers = "python_version < \"3.10\""}, - {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, - {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, -] -python-dateutil = ">=2.8.2" -pytz = ">=2020.1" -tzdata = ">=2022.1" - -[package.extras] -all = ["PyQt5 (>=5.15.1)", "SQLAlchemy (>=1.4.16)", "beautifulsoup4 (>=4.9.3)", "bottleneck (>=1.3.2)", "brotlipy (>=0.7.0)", "fastparquet (>=0.6.3)", "fsspec (>=2021.07.0)", "gcsfs (>=2021.07.0)", "html5lib (>=1.1)", "hypothesis (>=6.34.2)", "jinja2 (>=3.0.0)", "lxml (>=4.6.3)", "matplotlib (>=3.6.1)", "numba (>=0.53.1)", "numexpr (>=2.7.3)", "odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pandas-gbq (>=0.15.0)", "psycopg2 (>=2.8.6)", "pyarrow (>=7.0.0)", "pymysql (>=1.0.2)", "pyreadstat (>=1.1.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)", "python-snappy (>=0.6.0)", "pyxlsb (>=1.0.8)", "qtpy (>=2.2.0)", "s3fs (>=2021.08.0)", "scipy (>=1.7.1)", "tables (>=3.6.1)", "tabulate (>=0.8.9)", "xarray (>=0.21.0)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)", "zstandard (>=0.15.2)"] -aws = ["s3fs (>=2021.08.0)"] -clipboard = ["PyQt5 (>=5.15.1)", "qtpy (>=2.2.0)"] -compression = ["brotlipy (>=0.7.0)", "python-snappy (>=0.6.0)", "zstandard (>=0.15.2)"] -computation = ["scipy (>=1.7.1)", "xarray (>=0.21.0)"] -excel = ["odfpy (>=1.4.1)", "openpyxl (>=3.0.7)", "pyxlsb (>=1.0.8)", "xlrd (>=2.0.1)", "xlsxwriter (>=1.4.3)"] -feather = ["pyarrow (>=7.0.0)"] -fss = ["fsspec (>=2021.07.0)"] -gcp = ["gcsfs (>=2021.07.0)", "pandas-gbq (>=0.15.0)"] -hdf5 = ["tables (>=3.6.1)"] -html = ["beautifulsoup4 (>=4.9.3)", "html5lib (>=1.1)", "lxml (>=4.6.3)"] -mysql = ["SQLAlchemy (>=1.4.16)", "pymysql (>=1.0.2)"] -output-formatting = ["jinja2 (>=3.0.0)", "tabulate (>=0.8.9)"] -parquet = ["pyarrow (>=7.0.0)"] -performance = ["bottleneck (>=1.3.2)", "numba (>=0.53.1)", "numexpr (>=2.7.1)"] -plot = ["matplotlib (>=3.6.1)"] -postgresql = ["SQLAlchemy (>=1.4.16)", "psycopg2 (>=2.8.6)"] -spss = ["pyreadstat (>=1.1.2)"] -sql-other = ["SQLAlchemy (>=1.4.16)"] -test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] -xml = ["lxml (>=4.6.3)"] - [[package]] name = "pandocfilters" version = "1.5.0" @@ -2476,75 +2267,6 @@ files = [ {file = "pickleshare-0.7.5.tar.gz", hash = "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca"}, ] -[[package]] -name = "pillow" -version = "10.0.0" -description = "Python Imaging Library (Fork)" -optional = true -python-versions = ">=3.8" -files = [ - {file = "Pillow-10.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891"}, - {file = "Pillow-10.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d5db32e2a6ccbb3d34d87c87b432959e0db29755727afb37290e10f6e8e62614"}, - {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edf4392b77bdc81f36e92d3a07a5cd072f90253197f4a52a55a8cec48a12483b"}, - {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:520f2a520dc040512699f20fa1c363eed506e94248d71f85412b625026f6142c"}, - {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:8c11160913e3dd06c8ffdb5f233a4f254cb449f4dfc0f8f4549eda9e542c93d1"}, - {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", 
hash = "sha256:a74ba0c356aaa3bb8e3eb79606a87669e7ec6444be352870623025d75a14a2bf"}, - {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d5d0dae4cfd56969d23d94dc8e89fb6a217be461c69090768227beb8ed28c0a3"}, - {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22c10cc517668d44b211717fd9775799ccec4124b9a7f7b3635fc5386e584992"}, - {file = "Pillow-10.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dffe31a7f47b603318c609f378ebcd57f1554a3a6a8effbc59c3c69f804296de"}, - {file = "Pillow-10.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:9fb218c8a12e51d7ead2a7c9e101a04982237d4855716af2e9499306728fb485"}, - {file = "Pillow-10.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d35e3c8d9b1268cbf5d3670285feb3528f6680420eafe35cccc686b73c1e330f"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ed64f9ca2f0a95411e88a4efbd7a29e5ce2cea36072c53dd9d26d9c76f753b3"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b6eb5502f45a60a3f411c63187db83a3d3107887ad0d036c13ce836f8a36f1d"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c1fbe7621c167ecaa38ad29643d77a9ce7311583761abf7836e1510c580bf3dd"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cd25d2a9d2b36fcb318882481367956d2cf91329f6892fe5d385c346c0649629"}, - {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3b08d4cc24f471b2c8ca24ec060abf4bebc6b144cb89cba638c720546b1cf538"}, - {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d737a602fbd82afd892ca746392401b634e278cb65d55c4b7a8f48e9ef8d008d"}, - {file = "Pillow-10.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3a82c40d706d9aa9734289740ce26460a11aeec2d9c79b7af87bb35f0073c12f"}, - {file = "Pillow-10.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:bc2ec7c7b5d66b8ec9ce9f720dbb5fa4bace0f545acd34870eff4a369b44bf37"}, - {file = "Pillow-10.0.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:d80cf684b541685fccdd84c485b31ce73fc5c9b5d7523bf1394ce134a60c6883"}, - {file = "Pillow-10.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76de421f9c326da8f43d690110f0e79fe3ad1e54be811545d7d91898b4c8493e"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81ff539a12457809666fef6624684c008e00ff6bf455b4b89fd00a140eecd640"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce543ed15570eedbb85df19b0a1a7314a9c8141a36ce089c0a894adbfccb4568"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:685ac03cc4ed5ebc15ad5c23bc555d68a87777586d970c2c3e216619a5476223"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d72e2ecc68a942e8cf9739619b7f408cc7b272b279b56b2c83c6123fcfa5cdff"}, - {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d50b6aec14bc737742ca96e85d6d0a5f9bfbded018264b3b70ff9d8c33485551"}, - {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:00e65f5e822decd501e374b0650146063fbb30a7264b4d2744bdd7b913e0cab5"}, - {file = "Pillow-10.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:f31f9fdbfecb042d046f9d91270a0ba28368a723302786c0009ee9b9f1f60199"}, - {file = "Pillow-10.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:1ce91b6ec08d866b14413d3f0bbdea7e24dfdc8e59f562bb77bc3fe60b6144ca"}, - {file = 
"Pillow-10.0.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:349930d6e9c685c089284b013478d6f76e3a534e36ddfa912cde493f235372f3"}, - {file = "Pillow-10.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3a684105f7c32488f7153905a4e3015a3b6c7182e106fe3c37fbb5ef3e6994c3"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b4f69b3700201b80bb82c3a97d5e9254084f6dd5fb5b16fc1a7b974260f89f43"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f07ea8d2f827d7d2a49ecf1639ec02d75ffd1b88dcc5b3a61bbb37a8759ad8d"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:040586f7d37b34547153fa383f7f9aed68b738992380ac911447bb78f2abe530"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f88a0b92277de8e3ca715a0d79d68dc82807457dae3ab8699c758f07c20b3c51"}, - {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c7cf14a27b0d6adfaebb3ae4153f1e516df54e47e42dcc073d7b3d76111a8d86"}, - {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3400aae60685b06bb96f99a21e1ada7bc7a413d5f49bce739828ecd9391bb8f7"}, - {file = "Pillow-10.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbc02381779d412145331789b40cc7b11fdf449e5d94f6bc0b080db0a56ea3f0"}, - {file = "Pillow-10.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:9211e7ad69d7c9401cfc0e23d49b69ca65ddd898976d660a2fa5904e3d7a9baa"}, - {file = "Pillow-10.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:faaf07ea35355b01a35cb442dd950d8f1bb5b040a7787791a535de13db15ed90"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9f72a021fbb792ce98306ffb0c348b3c9cb967dce0f12a49aa4c3d3fdefa967"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f7c16705f44e0504a3a2a14197c1f0b32a95731d251777dcb060aa83022cb2d"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:76edb0a1fa2b4745fb0c99fb9fb98f8b180a1bbceb8be49b087e0b21867e77d3"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:368ab3dfb5f49e312231b6f27b8820c823652b7cd29cfbd34090565a015e99ba"}, - {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:608bfdee0d57cf297d32bcbb3c728dc1da0907519d1784962c5f0c68bb93e5a3"}, - {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5c6e3df6bdd396749bafd45314871b3d0af81ff935b2d188385e970052091017"}, - {file = "Pillow-10.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:7be600823e4c8631b74e4a0d38384c73f680e6105a7d3c6824fcf226c178c7e6"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:92be919bbc9f7d09f7ae343c38f5bb21c973d2576c1d45600fce4b74bafa7ac0"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8182b523b2289f7c415f589118228d30ac8c355baa2f3194ced084dac2dbba"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:38250a349b6b390ee6047a62c086d3817ac69022c127f8a5dc058c31ccef17f3"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:88af2003543cc40c80f6fca01411892ec52b11021b3dc22ec3bc9d5afd1c5334"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c189af0545965fa8d3b9613cfdb0cd37f9d71349e0f7750e1fd704648d475ed2"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ce7b031a6fc11365970e6a5686d7ba8c63e4c1cf1ea143811acbb524295eabed"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:db24668940f82321e746773a4bc617bfac06ec831e5c88b643f91f122a785684"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:efe8c0681042536e0d06c11f48cebe759707c9e9abf880ee213541c5b46c5bf3"}, - {file = "Pillow-10.0.0.tar.gz", hash = "sha256:9c82b5b3e043c7af0d95792d0d20ccf68f61a1fec6b3530e718b688422727396"}, -] - -[package.extras] -docs = ["furo", "olefile", "sphinx (>=2.4)", "sphinx-copybutton", "sphinx-inline-tabs", "sphinx-removed-in", "sphinxext-opengraph"] -tests = ["check-manifest", "coverage", "defusedxml", "markdown2", "olefile", "packaging", "pyroma", "pytest", "pytest-cov", "pytest-timeout"] - [[package]] name = "pkgutil-resolve-name" version = "1.3.10" @@ -3108,13 +2830,13 @@ cffi = {version = "*", markers = "implementation_name == \"pypy\""} [[package]] name = "qtconsole" -version = "5.4.4" +version = "5.4.3" description = "Jupyter Qt console" optional = false python-versions = ">= 3.7" files = [ - {file = "qtconsole-5.4.4-py3-none-any.whl", hash = "sha256:a3b69b868e041c2c698bdc75b0602f42e130ffb256d6efa48f9aa756c97672aa"}, - {file = "qtconsole-5.4.4.tar.gz", hash = "sha256:b7ffb53d74f23cee29f4cdb55dd6fabc8ec312d94f3c46ba38e1dde458693dfb"}, + {file = "qtconsole-5.4.3-py3-none-any.whl", hash = "sha256:35fd6e87b1f6d1fd41801b07e69339f8982e76afd4fa8ef35595bc6036717189"}, + {file = "qtconsole-5.4.3.tar.gz", hash = "sha256:5e4082a86a201796b2a5cfd4298352d22b158b51b57736531824715fc2a979dd"}, ] [package.dependencies] @@ -3125,7 +2847,7 @@ jupyter-core = "*" packaging = "*" pygments = "*" pyzmq = ">=17.1" -qtpy = ">=2.4.0" +qtpy = ">=2.0.1" traitlets = "<5.2.1 || >5.2.1,<5.2.2 || >5.2.2" [package.extras] @@ -3134,13 +2856,13 @@ test = ["flaky", "pytest", "pytest-qt"] [[package]] name = "qtpy" -version = "2.4.0" +version = "2.3.1" description = "Provides an abstraction layer on top of the various Qt bindings (PyQt5/6 and PySide2/6)." 
optional = false python-versions = ">=3.7" files = [ - {file = "QtPy-2.4.0-py3-none-any.whl", hash = "sha256:4d4f045a41e09ac9fa57fcb47ef05781aa5af294a0a646acc1b729d14225e741"}, - {file = "QtPy-2.4.0.tar.gz", hash = "sha256:db2d508167aa6106781565c8da5c6f1487debacba33519cedc35fa8997d424d4"}, + {file = "QtPy-2.3.1-py3-none-any.whl", hash = "sha256:5193d20e0b16e4d9d3bc2c642d04d9f4e2c892590bd1b9c92bfe38a95d5a2e12"}, + {file = "QtPy-2.3.1.tar.gz", hash = "sha256:a8c74982d6d172ce124d80cafd39653df78989683f760f2281ba91a6e7b9de8b"}, ] [package.dependencies] @@ -3324,108 +3046,108 @@ files = [ [[package]] name = "rpds-py" -version = "0.10.2" +version = "0.9.2" description = "Python bindings to Rust's persistent data structures (rpds)" optional = false python-versions = ">=3.8" files = [ - {file = "rpds_py-0.10.2-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:9f00d54b18dd837f1431d66b076737deb7c29ce3ebb8412ceaf44d5e1954ac0c"}, - {file = "rpds_py-0.10.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8f4d561f4728f825e3b793a53064b606ca0b6fc264f67d09e54af452aafc5b82"}, - {file = "rpds_py-0.10.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:013d6c784150d10236a74b4094a79d96a256b814457e388fc5a4ba9efe24c402"}, - {file = "rpds_py-0.10.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:bd1142d22fdb183a0fff66d79134bf644401437fed874f81066d314c67ee193c"}, - {file = "rpds_py-0.10.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4a0536ed2b9297c75104e1a3da330828ba1b2639fa53b38d396f98bf7e3c68df"}, - {file = "rpds_py-0.10.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:41bd430b7b63aa802c02964e331ac0b177148fef5f807d2c90d05ce71a52b4d4"}, - {file = "rpds_py-0.10.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e8474f7233fe1949ce4e03bea698a600c2d5d6b51dab6d6e6336dbe69acf23e"}, - {file = "rpds_py-0.10.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:d9d7efaad48b859053b90dedd69bc92f2095084251e732e4c57ac9726bcb1e64"}, - {file = "rpds_py-0.10.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:5612b0b1de8d5114520094bd5fc3d04eb8af6f3e10d48ef05b7c8e77c1fd9545"}, - {file = "rpds_py-0.10.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5d5eaf988951f6ecb6854ca3300b87123599c711183c83da7ce39717a7cbdbce"}, - {file = "rpds_py-0.10.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:75c8766734ac0053e1d683567e65e85306c4ec62631b0591caeb287ac8f72e08"}, - {file = "rpds_py-0.10.2-cp310-none-win32.whl", hash = "sha256:8de9b88f0cbac73cfed34220d13c57849e62a7099a714b929142425e926d223a"}, - {file = "rpds_py-0.10.2-cp310-none-win_amd64.whl", hash = "sha256:2275f1a022e2383da5d2d101fe11ccdcbae799148c4b83260a4b9309fa3e1fc2"}, - {file = "rpds_py-0.10.2-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:dd91a7d7a9ce7f4983097c91ce211f3e5569cc21caa16f2692298a07e396f82b"}, - {file = "rpds_py-0.10.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e82b4a70cc67094f3f3fd77579702f48fcf1de7bdc67d79b8f1e24d089a6162c"}, - {file = "rpds_py-0.10.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e281b71922208e00886e4b7ffbfcf27874486364f177418ab676f102130e7ec9"}, - {file = "rpds_py-0.10.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b3eb1a0d2b6d232d1bcdfc3fcc5f7b004ab3fbd9203011a3172f051d4527c0b6"}, - {file = "rpds_py-0.10.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = 
"sha256:02945ae38fd78efc40900f509890de84cfd5ffe2cd2939eeb3a8800dc68b87cb"}, - {file = "rpds_py-0.10.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ccfb77f6dc8abffa6f1c7e3975ed9070a41ce5fcc11154d2bead8c1baa940f09"}, - {file = "rpds_py-0.10.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:af52078719209bef33e38131486fd784832dd8d1dc9b85f00a44f6e7437dd021"}, - {file = "rpds_py-0.10.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:56ba7c1100ed079527f2b995bf5486a2e557e6d5b733c52e8947476338815b69"}, - {file = "rpds_py-0.10.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:899b03a3be785a7e1ff84b237da71f0efa2f021512f147dd34ffdf7aa82cb678"}, - {file = "rpds_py-0.10.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:22e6de18f00583f06928cc8d0993104ecc62f7c6da6478db2255de89a30e45d1"}, - {file = "rpds_py-0.10.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:edd74b760a6bb950397e7a7bd2f38e6700f6525062650b1d77c6d851b82f02c2"}, - {file = "rpds_py-0.10.2-cp311-none-win32.whl", hash = "sha256:18909093944727e068ebfc92e2e6ed1c4fa44135507c1c0555213ce211c53214"}, - {file = "rpds_py-0.10.2-cp311-none-win_amd64.whl", hash = "sha256:9568764e72d85cf7855ca78b48e07ed1be47bf230e2cea8dabda3c95f660b0ff"}, - {file = "rpds_py-0.10.2-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:0fc625059b83695fbb4fc8b7a8b66fa94ff9c7b78c84fb9986cd53ff88a28d80"}, - {file = "rpds_py-0.10.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c86231c66e4f422e7c13ea6200bb4048b3016c8bfd11b4fd0dabd04d2c8e3501"}, - {file = "rpds_py-0.10.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:56777c57246e048908b550af9b81b0ec9cf804fd47cb7502ccd93238bd6025c2"}, - {file = "rpds_py-0.10.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:a4cb372e22e9c879bd9a9cc9b20b7c1fbf30a605ac953da45ecec05d8a6e1c77"}, - {file = "rpds_py-0.10.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:aa3b3a43dabc4cc57a7800f526cbe03f71c69121e21b863fdf497b59b462b163"}, - {file = "rpds_py-0.10.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d222086daa55421d599609b32d0ebe544e57654c4a0a1490c54a7ebaa67561"}, - {file = "rpds_py-0.10.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:529aab727f54a937085184e7436e1d0e19975cf10115eda12d37a683e4ee5342"}, - {file = "rpds_py-0.10.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:43e9b1531d6a898bdf086acb75c41265c7ec4331267d7619148d407efc72bd24"}, - {file = "rpds_py-0.10.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c2772bb95062e3f9774140205cd65d8997e39620715486cf5f843cf4ad8f744c"}, - {file = "rpds_py-0.10.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:ba1b28e44f611f3f2b436bd8290050a61db4b59a8e24be4465f44897936b3824"}, - {file = "rpds_py-0.10.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5aba767e64b494483ad60c4873bec78d16205a21f8247c99749bd990d9c846c2"}, - {file = "rpds_py-0.10.2-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:e1954f4b239d1a92081647eecfd51cbfd08ea16eb743b8af1cd0113258feea14"}, - {file = "rpds_py-0.10.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:de4a2fd524993578fe093044f291b4b24aab134390030b3b9b5f87fd41ab7e75"}, - {file = "rpds_py-0.10.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e69737bd56006a86fd5a78b2b85447580a6138c930a75eb9ef39fe03d90782b1"}, - {file = 
"rpds_py-0.10.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:f40abbcc0a7d9a8a80870af839d317e6932533f98682aabd977add6c53beeb23"}, - {file = "rpds_py-0.10.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:29ec8507664f94cc08457d98cfc41c3cdbddfa8952438e644177a29b04937876"}, - {file = "rpds_py-0.10.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:bcde80aefe7054fad6277762fb7e9d35c72ea479a485ae1bb14629c640987b30"}, - {file = "rpds_py-0.10.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a65de5c02884760a14a58304fb6303f9ddfc582e630f385daea871e1bdb18686"}, - {file = "rpds_py-0.10.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:e92e5817eb6bfed23aa5e45bfe30647b83602bdd6f9e25d63524d4e6258458b0"}, - {file = "rpds_py-0.10.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:2c8fc6c841ada60a86d29c9ebe2e8757c47eda6553f3596c560e59ca6e9b6fa1"}, - {file = "rpds_py-0.10.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:8557c807388e6617161fe51b1a4747ea8d1133f2d2ad8e79583439abebe58fbd"}, - {file = "rpds_py-0.10.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:00e97d43a36811b78fa9ad9d3329bf34f76a31e891a7031a2ac01450c9b168ab"}, - {file = "rpds_py-0.10.2-cp38-none-win32.whl", hash = "sha256:1ed3d5385d14be894e12a9033be989e012214a9811e7194849c94032ad69682a"}, - {file = "rpds_py-0.10.2-cp38-none-win_amd64.whl", hash = "sha256:02b4a2e28eb24dac4ef43dda4f6a6f7766e355179b143f7d0c76a1c5488a307b"}, - {file = "rpds_py-0.10.2-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:2a55631b93e47956fbc97d69ba2054a8c6a4016f9a3064ec4e031f5f1030cb90"}, - {file = "rpds_py-0.10.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2ffbf1b38c88d0466de542e91b08225d51782282512f8e2b11715126c41fda48"}, - {file = "rpds_py-0.10.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:213f9ef5c02ec2f883c1075d25a873149daadbaea50d18d622e9db55ec9849c2"}, - {file = "rpds_py-0.10.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b00150a9a3fd0a8efaa90bc2696c105b04039d50763dd1c95a34c88c5966cb57"}, - {file = "rpds_py-0.10.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ab0f7aabdbce4a202e013083eeab71afdb85efa405dc4a06fea98cde81204675"}, - {file = "rpds_py-0.10.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2cd0c9fb5d40887500b4ed818770c68ab4fa6e0395d286f9704be6751b1b7d98"}, - {file = "rpds_py-0.10.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8578fc6c8bdd0201327503720fa581000b4bd3934abbf07e2628d1ad3de157d"}, - {file = "rpds_py-0.10.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:2d27d08056fcd61ff47a0cd8407eff4d3e816c82cb6b9c6f0ce9a0ad49225f81"}, - {file = "rpds_py-0.10.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:c8f6526df47953b07c45b95c4d1da6b9a0861c0e5da0271db96bb1d807825412"}, - {file = "rpds_py-0.10.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:177c033e467a66a054dd3a9534167234a3d0b2e41445807b13b626e01da25d92"}, - {file = "rpds_py-0.10.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:9c74cbee9e532dc34371127f7686d6953e5153a1f22beab7f953d95ee4a0fe09"}, - {file = "rpds_py-0.10.2-cp39-none-win32.whl", hash = "sha256:05a1382905026bdd560f806c8c7c16e0f3e3fb359ba8868203ca6e5799884968"}, - {file = "rpds_py-0.10.2-cp39-none-win_amd64.whl", hash = "sha256:3fd503c27e7b7034128e30847ecdb4bff4ca5e60f29ad022a9f66ae8940d54ac"}, - {file = 
"rpds_py-0.10.2-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:4a96147791e49e84207dd1530109aa0e9eeaf1c8b7a59f150047fc0fcdf9bb64"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:203eb1532d51591d32e8dfafd60b5d31347ea7278c8da02b4b550287f6abe28b"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a2f416cdfe92f5fbb77177f5f3f7830059d1582db05f2c7119bf80069d1ab69b"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:b2660000e1a113869c86eb5cc07f3343467490f3cd9d0299f81da9ddae7137b7"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1adb04e4b4e41bf30aaa77eeb169c1b9ba9e5010e2e6ce8d6c17e1446edc9b68"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2bca97521ee786087f0c5ef318fef3eef0266a9c3deff88205523cf353af7394"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4969592e3cdeefa4cbb15a26cec102cbd4a1d6e5b695fac9fa026e19741138c8"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:df61f818edf7c8626bfa392f825860fb670b5f8336e238eb0ec7e2a5689cdded"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:b589d93a60e78fe55d5bc76ee8c2bf945dbdbb7cd16044c53e0307604e448de1"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:73da69e1f612c3e682e34dcb971272d90d6f27b2c99acff444ca455a89978574"}, - {file = "rpds_py-0.10.2-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:89438e8885a186c69fe31f7ef98bb2bf29688c466c3caf9060f404c0be89ae80"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:c4ecc4e9a5d73a816cae36ee6b5d8b7a0c72013cae1e101406e832887c3dc2d8"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:907b214da5d2fcff0b6ddb83de1333890ca92abaf4bbf8d9c61dc1b95c87fd6e"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:bb44644371eaa29a3aba7b69b1862d0d56f073bb7585baa32e4271a71a91ee82"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:80c3cf46511653f94dfe07c7c79ab105c4164d6e1dfcb35b7214fb9af53eaef4"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:eaba0613c759ebf95988a84f766ca6b7432d55ce399194f95dde588ad1be0878"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0527c97dcd8bb983822ee31d3760187083fd3ba18ac4dd22cf5347c89d5628f4"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9cdfd649011ce2d90cb0dd304c5aba1190fac0c266d19a9e2b96b81cfd150a09"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:75eea40355a8690459c7291ce6c8ce39c27bd223675c7da6619f510c728feb97"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:4f1b804cfad04f862d6a84af9d1ad941b06f671878f0f7ecad6c92007d423de6"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:bf77f9017fcfa1232f98598a637406e6c33982ccba8a5922339575c3e2b90ea5"}, - {file = "rpds_py-0.10.2-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = 
"sha256:46c4c550bf59ce05d6bff2c98053822549aaf9fbaf81103edea325e03350bca1"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:46af4a742b90c7460e94214f923452c2c1d050a9da1d2b8d4c70cbc045e692b7"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:2a86d246a160d98d820ee7d02dc18c923c228de095be362e57b9fd8970b2c4a1"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ae141c9017f8f473a6ee07a9425da021816a9f8c0683c2e5442f0ccf56b0fc62"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:e1147bc3d0dd1e549d991110d0a09557ec9f925dbc1ca62871fcdab2ec9d716b"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fce7a8ee8d0f682c953c0188735d823f0fcb62779bf92cd6ba473a8e730e26ad"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4c7f9d70f99e1fbcbf57c75328b80e1c0a7f6cad43e75efa90a97221be5efe15"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b309908b6ff5ffbf6394818cb73b5a2a74073acee2c57fe8719046389aeff0d"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:3ff1f585a0fdc1415bd733b804f33d386064a308672249b14828130dd43e7c31"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:0188b580c490bccb031e9b67e9e8c695a3c44ac5e06218b152361eca847317c3"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:abe081453166e206e3a8c6d8ace57214c17b6d9477d7601ac14a365344dbc1f4"}, - {file = "rpds_py-0.10.2-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:9118de88c16947eaf5b92f749e65b0501ea69e7c2be7bd6aefc12551622360e1"}, - {file = "rpds_py-0.10.2.tar.gz", hash = "sha256:289073f68452b96e70990085324be7223944c7409973d13ddfe0eea1c1b5663b"}, + {file = "rpds_py-0.9.2-cp310-cp310-macosx_10_7_x86_64.whl", hash = "sha256:ab6919a09c055c9b092798ce18c6c4adf49d24d4d9e43a92b257e3f2548231e7"}, + {file = "rpds_py-0.9.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d55777a80f78dd09410bd84ff8c95ee05519f41113b2df90a69622f5540c4f8b"}, + {file = "rpds_py-0.9.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a216b26e5af0a8e265d4efd65d3bcec5fba6b26909014effe20cd302fd1138fa"}, + {file = "rpds_py-0.9.2-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:29cd8bfb2d716366a035913ced99188a79b623a3512292963d84d3e06e63b496"}, + {file = "rpds_py-0.9.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:44659b1f326214950a8204a248ca6199535e73a694be8d3e0e869f820767f12f"}, + {file = "rpds_py-0.9.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:745f5a43fdd7d6d25a53ab1a99979e7f8ea419dfefebcab0a5a1e9095490ee5e"}, + {file = "rpds_py-0.9.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a987578ac5214f18b99d1f2a3851cba5b09f4a689818a106c23dbad0dfeb760f"}, + {file = "rpds_py-0.9.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:bf4151acb541b6e895354f6ff9ac06995ad9e4175cbc6d30aaed08856558201f"}, + {file = "rpds_py-0.9.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:03421628f0dc10a4119d714a17f646e2837126a25ac7a256bdf7c3943400f67f"}, + {file = "rpds_py-0.9.2-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:13b602dc3e8dff3063734f02dcf05111e887f301fdda74151a93dbbc249930fe"}, + {file = "rpds_py-0.9.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:fae5cb554b604b3f9e2c608241b5d8d303e410d7dfb6d397c335f983495ce7f6"}, + {file = "rpds_py-0.9.2-cp310-none-win32.whl", hash = "sha256:47c5f58a8e0c2c920cc7783113df2fc4ff12bf3a411d985012f145e9242a2764"}, + {file = "rpds_py-0.9.2-cp310-none-win_amd64.whl", hash = "sha256:4ea6b73c22d8182dff91155af018b11aac9ff7eca085750455c5990cb1cfae6e"}, + {file = "rpds_py-0.9.2-cp311-cp311-macosx_10_7_x86_64.whl", hash = "sha256:e564d2238512c5ef5e9d79338ab77f1cbbda6c2d541ad41b2af445fb200385e3"}, + {file = "rpds_py-0.9.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f411330a6376fb50e5b7a3e66894e4a39e60ca2e17dce258d53768fea06a37bd"}, + {file = "rpds_py-0.9.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e7521f5af0233e89939ad626b15278c71b69dc1dfccaa7b97bd4cdf96536bb7"}, + {file = "rpds_py-0.9.2-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:8d3335c03100a073883857e91db9f2e0ef8a1cf42dc0369cbb9151c149dbbc1b"}, + {file = "rpds_py-0.9.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d25b1c1096ef0447355f7293fbe9ad740f7c47ae032c2884113f8e87660d8f6e"}, + {file = "rpds_py-0.9.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a5d3fbd02efd9cf6a8ffc2f17b53a33542f6b154e88dd7b42ef4a4c0700fdad"}, + {file = "rpds_py-0.9.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5934e2833afeaf36bd1eadb57256239785f5af0220ed8d21c2896ec4d3a765f"}, + {file = "rpds_py-0.9.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:095b460e117685867d45548fbd8598a8d9999227e9061ee7f012d9d264e6048d"}, + {file = "rpds_py-0.9.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:91378d9f4151adc223d584489591dbb79f78814c0734a7c3bfa9c9e09978121c"}, + {file = "rpds_py-0.9.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:24a81c177379300220e907e9b864107614b144f6c2a15ed5c3450e19cf536fae"}, + {file = "rpds_py-0.9.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:de0b6eceb46141984671802d412568d22c6bacc9b230174f9e55fc72ef4f57de"}, + {file = "rpds_py-0.9.2-cp311-none-win32.whl", hash = "sha256:700375326ed641f3d9d32060a91513ad668bcb7e2cffb18415c399acb25de2ab"}, + {file = "rpds_py-0.9.2-cp311-none-win_amd64.whl", hash = "sha256:0766babfcf941db8607bdaf82569ec38107dbb03c7f0b72604a0b346b6eb3298"}, + {file = "rpds_py-0.9.2-cp312-cp312-macosx_10_7_x86_64.whl", hash = "sha256:b1440c291db3f98a914e1afd9d6541e8fc60b4c3aab1a9008d03da4651e67386"}, + {file = "rpds_py-0.9.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:0f2996fbac8e0b77fd67102becb9229986396e051f33dbceada3debaacc7033f"}, + {file = "rpds_py-0.9.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f30d205755566a25f2ae0382944fcae2f350500ae4df4e795efa9e850821d82"}, + {file = "rpds_py-0.9.2-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:159fba751a1e6b1c69244e23ba6c28f879a8758a3e992ed056d86d74a194a0f3"}, + {file = "rpds_py-0.9.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a1f044792e1adcea82468a72310c66a7f08728d72a244730d14880cd1dabe36b"}, + {file = "rpds_py-0.9.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9251eb8aa82e6cf88510530b29eef4fac825a2b709baf5b94a6094894f252387"}, + {file = "rpds_py-0.9.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:01899794b654e616c8625b194ddd1e5b51ef5b60ed61baa7a2d9c2ad7b2a4238"}, + {file = "rpds_py-0.9.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:b0c43f8ae8f6be1d605b0465671124aa8d6a0e40f1fb81dcea28b7e3d87ca1e1"}, + {file = "rpds_py-0.9.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:207f57c402d1f8712618f737356e4b6f35253b6d20a324d9a47cb9f38ee43a6b"}, + {file = "rpds_py-0.9.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:b52e7c5ae35b00566d244ffefba0f46bb6bec749a50412acf42b1c3f402e2c90"}, + {file = "rpds_py-0.9.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:978fa96dbb005d599ec4fd9ed301b1cc45f1a8f7982d4793faf20b404b56677d"}, + {file = "rpds_py-0.9.2-cp38-cp38-macosx_10_7_x86_64.whl", hash = "sha256:6aa8326a4a608e1c28da191edd7c924dff445251b94653988efb059b16577a4d"}, + {file = "rpds_py-0.9.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:aad51239bee6bff6823bbbdc8ad85136c6125542bbc609e035ab98ca1e32a192"}, + {file = "rpds_py-0.9.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4bd4dc3602370679c2dfb818d9c97b1137d4dd412230cfecd3c66a1bf388a196"}, + {file = "rpds_py-0.9.2-cp38-cp38-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:dd9da77c6ec1f258387957b754f0df60766ac23ed698b61941ba9acccd3284d1"}, + {file = "rpds_py-0.9.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:190ca6f55042ea4649ed19c9093a9be9d63cd8a97880106747d7147f88a49d18"}, + {file = "rpds_py-0.9.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:876bf9ed62323bc7dcfc261dbc5572c996ef26fe6406b0ff985cbcf460fc8a4c"}, + {file = "rpds_py-0.9.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fa2818759aba55df50592ecbc95ebcdc99917fa7b55cc6796235b04193eb3c55"}, + {file = "rpds_py-0.9.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:9ea4d00850ef1e917815e59b078ecb338f6a8efda23369677c54a5825dbebb55"}, + {file = "rpds_py-0.9.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:5855c85eb8b8a968a74dc7fb014c9166a05e7e7a8377fb91d78512900aadd13d"}, + {file = "rpds_py-0.9.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:14c408e9d1a80dcb45c05a5149e5961aadb912fff42ca1dd9b68c0044904eb32"}, + {file = "rpds_py-0.9.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:65a0583c43d9f22cb2130c7b110e695fff834fd5e832a776a107197e59a1898e"}, + {file = "rpds_py-0.9.2-cp38-none-win32.whl", hash = "sha256:71f2f7715935a61fa3e4ae91d91b67e571aeb5cb5d10331ab681256bda2ad920"}, + {file = "rpds_py-0.9.2-cp38-none-win_amd64.whl", hash = "sha256:674c704605092e3ebbbd13687b09c9f78c362a4bc710343efe37a91457123044"}, + {file = "rpds_py-0.9.2-cp39-cp39-macosx_10_7_x86_64.whl", hash = "sha256:07e2c54bef6838fa44c48dfbc8234e8e2466d851124b551fc4e07a1cfeb37260"}, + {file = "rpds_py-0.9.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:f7fdf55283ad38c33e35e2855565361f4bf0abd02470b8ab28d499c663bc5d7c"}, + {file = "rpds_py-0.9.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:890ba852c16ace6ed9f90e8670f2c1c178d96510a21b06d2fa12d8783a905193"}, + {file = "rpds_py-0.9.2-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:50025635ba8b629a86d9d5474e650da304cb46bbb4d18690532dd79341467846"}, + {file = "rpds_py-0.9.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:517cbf6e67ae3623c5127206489d69eb2bdb27239a3c3cc559350ef52a3bbf0b"}, + {file = "rpds_py-0.9.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:0836d71ca19071090d524739420a61580f3f894618d10b666cf3d9a1688355b1"}, + {file = "rpds_py-0.9.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c439fd54b2b9053717cca3de9583be6584b384d88d045f97d409f0ca867d80f"}, + {file = "rpds_py-0.9.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:f68996a3b3dc9335037f82754f9cdbe3a95db42bde571d8c3be26cc6245f2324"}, + {file = "rpds_py-0.9.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:7d68dc8acded354c972116f59b5eb2e5864432948e098c19fe6994926d8e15c3"}, + {file = "rpds_py-0.9.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:f963c6b1218b96db85fc37a9f0851eaf8b9040aa46dec112611697a7023da535"}, + {file = "rpds_py-0.9.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:5a46859d7f947061b4010e554ccd1791467d1b1759f2dc2ec9055fa239f1bc26"}, + {file = "rpds_py-0.9.2-cp39-none-win32.whl", hash = "sha256:e07e5dbf8a83c66783a9fe2d4566968ea8c161199680e8ad38d53e075df5f0d0"}, + {file = "rpds_py-0.9.2-cp39-none-win_amd64.whl", hash = "sha256:682726178138ea45a0766907957b60f3a1bf3acdf212436be9733f28b6c5af3c"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-macosx_10_7_x86_64.whl", hash = "sha256:196cb208825a8b9c8fc360dc0f87993b8b260038615230242bf18ec84447c08d"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:c7671d45530fcb6d5e22fd40c97e1e1e01965fc298cbda523bb640f3d923b387"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83b32f0940adec65099f3b1c215ef7f1d025d13ff947975a055989cb7fd019a4"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:7f67da97f5b9eac838b6980fc6da268622e91f8960e083a34533ca710bec8611"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:03975db5f103997904c37e804e5f340c8fdabbb5883f26ee50a255d664eed58c"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:987b06d1cdb28f88a42e4fb8a87f094e43f3c435ed8e486533aea0bf2e53d931"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c861a7e4aef15ff91233751619ce3a3d2b9e5877e0fcd76f9ea4f6847183aa16"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:02938432352359805b6da099c9c95c8a0547fe4b274ce8f1a91677401bb9a45f"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:ef1f08f2a924837e112cba2953e15aacfccbbfcd773b4b9b4723f8f2ddded08e"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-musllinux_1_2_i686.whl", hash = "sha256:35da5cc5cb37c04c4ee03128ad59b8c3941a1e5cd398d78c37f716f32a9b7f67"}, + {file = "rpds_py-0.9.2-pp310-pypy310_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:141acb9d4ccc04e704e5992d35472f78c35af047fa0cfae2923835d153f091be"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-macosx_10_7_x86_64.whl", hash = "sha256:79f594919d2c1a0cc17d1988a6adaf9a2f000d2e1048f71f298b056b1018e872"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-macosx_11_0_arm64.whl", hash = "sha256:a06418fe1155e72e16dddc68bb3780ae44cebb2912fbd8bb6ff9161de56e1798"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8b2eb034c94b0b96d5eddb290b7b5198460e2d5d0c421751713953a9c4e47d10"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = 
"sha256:8b08605d248b974eb02f40bdcd1a35d3924c83a2a5e8f5d0fa5af852c4d960af"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a0805911caedfe2736935250be5008b261f10a729a303f676d3d5fea6900c96a"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:ab2299e3f92aa5417d5e16bb45bb4586171c1327568f638e8453c9f8d9e0f020"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8c8d7594e38cf98d8a7df25b440f684b510cf4627fe038c297a87496d10a174f"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:8b9ec12ad5f0a4625db34db7e0005be2632c1013b253a4a60e8302ad4d462afd"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:1fcdee18fea97238ed17ab6478c66b2095e4ae7177e35fb71fbe561a27adf620"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-musllinux_1_2_i686.whl", hash = "sha256:933a7d5cd4b84f959aedeb84f2030f0a01d63ae6cf256629af3081cf3e3426e8"}, + {file = "rpds_py-0.9.2-pp38-pypy38_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:686ba516e02db6d6f8c279d1641f7067ebb5dc58b1d0536c4aaebb7bf01cdc5d"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-macosx_10_7_x86_64.whl", hash = "sha256:0173c0444bec0a3d7d848eaeca2d8bd32a1b43f3d3fde6617aac3731fa4be05f"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-macosx_11_0_arm64.whl", hash = "sha256:d576c3ef8c7b2d560e301eb33891d1944d965a4d7a2eacb6332eee8a71827db6"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ed89861ee8c8c47d6beb742a602f912b1bb64f598b1e2f3d758948721d44d468"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:1054a08e818f8e18910f1bee731583fe8f899b0a0a5044c6e680ceea34f93876"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:99e7c4bb27ff1aab90dcc3e9d37ee5af0231ed98d99cb6f5250de28889a3d502"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c545d9d14d47be716495076b659db179206e3fd997769bc01e2d550eeb685596"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9039a11bca3c41be5a58282ed81ae422fa680409022b996032a43badef2a3752"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-manylinux_2_5_i686.manylinux1_i686.whl", hash = "sha256:fb39aca7a64ad0c9490adfa719dbeeb87d13be137ca189d2564e596f8ba32c07"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-musllinux_1_2_aarch64.whl", hash = "sha256:2d8b3b3a2ce0eaa00c5bbbb60b6713e94e7e0becab7b3db6c5c77f979e8ed1f1"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-musllinux_1_2_i686.whl", hash = "sha256:99b1c16f732b3a9971406fbfe18468592c5a3529585a45a35adbc1389a529a03"}, + {file = "rpds_py-0.9.2-pp39-pypy39_pp73-musllinux_1_2_x86_64.whl", hash = "sha256:c27ee01a6c3223025f4badd533bea5e87c988cb0ba2811b690395dfe16088cfe"}, + {file = "rpds_py-0.9.2.tar.gz", hash = "sha256:8d70e8f14900f2657c249ea4def963bed86a29b81f81f5b76b5a9215680de945"}, ] [[package]] @@ -3454,165 +3176,6 @@ files = [ {file = "ruff-0.0.249.tar.gz", hash = "sha256:b590689f08ecef971c45555cbda6854cdf48f3828fc326802828e851b1a14b3d"}, ] -[[package]] -name = "safetensors" -version = "0.3.3" -description = "Fast and Safe Tensor serialization" -optional = true -python-versions = "*" -files = [ - {file = "safetensors-0.3.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = 
"sha256:92e4d0c8b2836120fddd134474c5bda8963f322333941f8b9f643e5b24f041eb"}, - {file = "safetensors-0.3.3-cp310-cp310-macosx_11_0_x86_64.whl", hash = "sha256:3dcadb6153c42addc9c625a622ebde9293fabe1973f9ef31ba10fb42c16e8536"}, - {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:08f26b61e1b0a14dc959aa9d568776bd038805f611caef1de04a80c468d4a7a4"}, - {file = "safetensors-0.3.3-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:17f41344d9a075f2f21b289a49a62e98baff54b5754240ba896063bce31626bf"}, - {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_arm64.whl", hash = "sha256:f1045f798e1a16a6ced98d6a42ec72936d367a2eec81dc5fade6ed54638cd7d2"}, - {file = "safetensors-0.3.3-cp310-cp310-macosx_13_0_x86_64.whl", hash = "sha256:eaf0e4bc91da13f21ac846a39429eb3f3b7ed06295a32321fa3eb1a59b5c70f3"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:25149180d4dc8ca48bac2ac3852a9424b466e36336a39659b35b21b2116f96fc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9e943bf78c39de8865398a71818315e7d5d1af93c7b30d4da3fc852e62ad9bc"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:cccfcac04a010354e87c7a2fe16a1ff004fc4f6e7ef8efc966ed30122ce00bc7"}, - {file = "safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a07121f427e646a50d18c1be0fa1a2cbf6398624c31149cd7e6b35486d72189e"}, - {file = "safetensors-0.3.3-cp310-cp310-win32.whl", hash = "sha256:a85e29cbfddfea86453cc0f4889b4bcc6b9c155be9a60e27be479a34e199e7ef"}, - {file = "safetensors-0.3.3-cp310-cp310-win_amd64.whl", hash = "sha256:e13adad4a3e591378f71068d14e92343e626cf698ff805f61cdb946e684a218e"}, - {file = "safetensors-0.3.3-cp311-cp311-macosx_11_0_universal2.whl", hash = "sha256:cbc3312f134baf07334dd517341a4b470b2931f090bd9284888acb7dfaf4606f"}, - {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:d15030af39d5d30c22bcbc6d180c65405b7ea4c05b7bab14a570eac7d7d43722"}, - {file = "safetensors-0.3.3-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:f84a74cbe9859b28e3d6d7715ac1dd3097bebf8d772694098f6d42435245860c"}, - {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:10d637423d98ab2e6a4ad96abf4534eb26fcaf8ca3115623e64c00759374e90d"}, - {file = "safetensors-0.3.3-cp311-cp311-macosx_13_0_universal2.whl", hash = "sha256:3b46f5de8b44084aff2e480874c550c399c730c84b2e8ad1bddb062c94aa14e9"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e76da691a82dfaf752854fa6d17c8eba0c8466370c5ad8cf1bfdf832d3c7ee17"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c4e342fd54e66aa9512dd13e410f791e47aa4feeb5f4c9a20882c72f3d272f29"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:178fd30b5dc73bce14a39187d948cedd0e5698e2f055b7ea16b5a96c9b17438e"}, - {file = "safetensors-0.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3e8fdf7407dba44587ed5e79d5de3533d242648e1f2041760b21474bd5ea5c8c"}, - {file = "safetensors-0.3.3-cp311-cp311-win32.whl", hash = "sha256:7d3b744cee8d7a46ffa68db1a2ff1a1a432488e3f7a5a97856fe69e22139d50c"}, - {file = "safetensors-0.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:f579877d30feec9b6ba409d05fa174633a4fc095675a4a82971d831a8bb60b97"}, - {file = 
"safetensors-0.3.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:2fff5b19a1b462c17322998b2f4b8bce43c16fe208968174d2f3a1446284ceed"}, - {file = "safetensors-0.3.3-cp37-cp37m-macosx_11_0_x86_64.whl", hash = "sha256:41adb1d39e8aad04b16879e3e0cbcb849315999fad73bc992091a01e379cb058"}, - {file = "safetensors-0.3.3-cp37-cp37m-macosx_12_0_x86_64.whl", hash = "sha256:0f2b404250b3b877b11d34afcc30d80e7035714a1116a3df56acaca6b6c00096"}, - {file = "safetensors-0.3.3-cp37-cp37m-macosx_13_0_x86_64.whl", hash = "sha256:b43956ef20e9f4f2e648818a9e7b3499edd6b753a0f5526d4f6a6826fbee8446"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d61a99b34169981f088ccfbb2c91170843efc869a0a0532f422db7211bf4f474"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c0008aab36cd20e9a051a68563c6f80d40f238c2611811d7faa5a18bf3fd3984"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:93d54166072b143084fdcd214a080a088050c1bb1651016b55942701b31334e4"}, - {file = "safetensors-0.3.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1c32ee08f61cea56a5d62bbf94af95df6040c8ab574afffaeb7b44ae5da1e9e3"}, - {file = "safetensors-0.3.3-cp37-cp37m-win32.whl", hash = "sha256:351600f367badd59f7bfe86d317bb768dd8c59c1561c6fac43cafbd9c1af7827"}, - {file = "safetensors-0.3.3-cp37-cp37m-win_amd64.whl", hash = "sha256:034717e297849dae1af0a7027a14b8647bd2e272c24106dced64d83e10d468d1"}, - {file = "safetensors-0.3.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8530399666748634bc0b301a6a5523756931b0c2680d188e743d16304afe917a"}, - {file = "safetensors-0.3.3-cp38-cp38-macosx_11_0_x86_64.whl", hash = "sha256:9d741c1f1621e489ba10aa3d135b54202684f6e205df52e219d5eecd673a80c9"}, - {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:0c345fd85b4d2093a5109596ff4cd9dfc2e84992e881b4857fbc4a93a3b89ddb"}, - {file = "safetensors-0.3.3-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:69ccee8d05f55cdf76f7e6c87d2bdfb648c16778ef8acfd2ecc495e273e9233e"}, - {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_arm64.whl", hash = "sha256:c08a9a4b7a4ca389232fa8d097aebc20bbd4f61e477abc7065b5c18b8202dede"}, - {file = "safetensors-0.3.3-cp38-cp38-macosx_13_0_x86_64.whl", hash = "sha256:a002868d2e3f49bbe81bee2655a411c24fa1f8e68b703dec6629cb989d6ae42e"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3bd2704cb41faa44d3ec23e8b97330346da0395aec87f8eaf9c9e2c086cdbf13"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4b2951bf3f0ad63df5e6a95263652bd6c194a6eb36fd4f2d29421cd63424c883"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:07114cec116253ca2e7230fdea30acf76828f21614afd596d7b5438a2f719bd8"}, - {file = "safetensors-0.3.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6ab43aeeb9eadbb6b460df3568a662e6f1911ecc39387f8752afcb6a7d96c087"}, - {file = "safetensors-0.3.3-cp38-cp38-win32.whl", hash = "sha256:f2f59fce31dd3429daca7269a6b06f65e6547a0c248f5116976c3f1e9b73f251"}, - {file = "safetensors-0.3.3-cp38-cp38-win_amd64.whl", hash = "sha256:c31ca0d8610f57799925bf08616856b39518ab772c65093ef1516762e796fde4"}, - {file = "safetensors-0.3.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:59a596b3225c96d59af412385981f17dd95314e3fffdf359c7e3f5bb97730a19"}, - {file = 
"safetensors-0.3.3-cp39-cp39-macosx_11_0_x86_64.whl", hash = "sha256:82a16e92210a6221edd75ab17acdd468dd958ef5023d9c6c1289606cc30d1479"}, - {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:98a929e763a581f516373ef31983ed1257d2d0da912a8e05d5cd12e9e441c93a"}, - {file = "safetensors-0.3.3-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:12b83f1986cd16ea0454c636c37b11e819d60dd952c26978310a0835133480b7"}, - {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_arm64.whl", hash = "sha256:f439175c827c2f1bbd54df42789c5204a10983a30bc4242bc7deaf854a24f3f0"}, - {file = "safetensors-0.3.3-cp39-cp39-macosx_13_0_x86_64.whl", hash = "sha256:0085be33b8cbcb13079b3a8e131656e05b0bc5e6970530d4c24150f7afd76d70"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3e3ec70c87b1e910769034206ad5efc051069b105aac1687f6edcd02526767f4"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f490132383e5e490e710608f4acffcb98ed37f91b885c7217d3f9f10aaff9048"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:79d1b6c7ed5596baf79c80fbce5198c3cdcc521ae6a157699f427aba1a90082d"}, - {file = "safetensors-0.3.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad3cc8006e7a86ee7c88bd2813ec59cd7cc75b03e6fa4af89b9c7b235b438d68"}, - {file = "safetensors-0.3.3-cp39-cp39-win32.whl", hash = "sha256:ab29f54c6b8c301ca05fa014728996bd83aac6e21528f893aaf8945c71f42b6d"}, - {file = "safetensors-0.3.3-cp39-cp39-win_amd64.whl", hash = "sha256:0fa82004eae1a71e2aa29843ef99de9350e459a0fc2f65fc6ee0da9690933d2d"}, - {file = "safetensors-0.3.3.tar.gz", hash = "sha256:edb7072d788c4f929d0f5735d3a2fb51e5a27f833587828583b7f5747af1a2b8"}, -] - -[package.extras] -all = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "flax (>=0.6.3)", "h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "isort (>=5.5.4)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)", "tensorflow (==2.11.0)", "torch (>=1.10)"] -dev = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "flax (>=0.6.3)", "h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "isort (>=5.5.4)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)", "tensorflow (==2.11.0)", "torch (>=1.10)"] -jax = ["flax (>=0.6.3)", "jax (>=0.3.25)", "jaxlib (>=0.3.25)", "numpy (>=1.21.6)"] -numpy = ["numpy (>=1.21.6)"] -paddlepaddle = ["numpy (>=1.21.6)", "paddlepaddle (>=2.4.1)"] -pinned-tf = ["tensorflow (==2.11.0)"] -quality = ["black (==22.3)", "click (==8.0.4)", "flake8 (>=3.8.3)", "isort (>=5.5.4)"] -tensorflow = ["numpy (>=1.21.6)", "tensorflow (>=2.11.0)"] -testing = ["h5py (>=3.7.0)", "huggingface-hub (>=0.12.1)", "numpy (>=1.21.6)", "pytest (>=7.2.0)", "pytest-benchmark (>=4.0.0)", "setuptools-rust (>=1.5.2)"] -torch = ["numpy (>=1.21.6)", "torch (>=1.10)"] - -[[package]] -name = "scikit-learn" -version = "1.3.0" -description = "A set of python modules for machine learning and data mining" -optional = true -python-versions = ">=3.8" -files = [ - {file = "scikit-learn-1.3.0.tar.gz", hash = "sha256:8be549886f5eda46436b6e555b0e4873b4f10aa21c07df45c4bc1735afbccd7a"}, - {file = "scikit_learn-1.3.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:981287869e576d42c682cf7ca96af0c6ac544ed9316328fd0d9292795c742cf5"}, - {file = "scikit_learn-1.3.0-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:436aaaae2c916ad16631142488e4c82f4296af2404f480e031d866863425d2a2"}, - {file = "scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c7e28d8fa47a0b30ae1bd7a079519dd852764e31708a7804da6cb6f8b36e3630"}, - {file = "scikit_learn-1.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae80c08834a473d08a204d966982a62e11c976228d306a2648c575e3ead12111"}, - {file = "scikit_learn-1.3.0-cp310-cp310-win_amd64.whl", hash = "sha256:552fd1b6ee22900cf1780d7386a554bb96949e9a359999177cf30211e6b20df6"}, - {file = "scikit_learn-1.3.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:79970a6d759eb00a62266a31e2637d07d2d28446fca8079cf9afa7c07b0427f8"}, - {file = "scikit_learn-1.3.0-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:850a00b559e636b23901aabbe79b73dc604b4e4248ba9e2d6e72f95063765603"}, - {file = "scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ee04835fb016e8062ee9fe9074aef9b82e430504e420bff51e3e5fffe72750ca"}, - {file = "scikit_learn-1.3.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d953531f5d9f00c90c34fa3b7d7cfb43ecff4c605dac9e4255a20b114a27369"}, - {file = "scikit_learn-1.3.0-cp311-cp311-win_amd64.whl", hash = "sha256:151ac2bf65ccf363664a689b8beafc9e6aae36263db114b4ca06fbbbf827444a"}, - {file = "scikit_learn-1.3.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:6a885a9edc9c0a341cab27ec4f8a6c58b35f3d449c9d2503a6fd23e06bbd4f6a"}, - {file = "scikit_learn-1.3.0-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:9877af9c6d1b15486e18a94101b742e9d0d2f343d35a634e337411ddb57783f3"}, - {file = "scikit_learn-1.3.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c470f53cea065ff3d588050955c492793bb50c19a92923490d18fcb637f6383a"}, - {file = "scikit_learn-1.3.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fd6e2d7389542eae01077a1ee0318c4fec20c66c957f45c7aac0c6eb0fe3c612"}, - {file = "scikit_learn-1.3.0-cp38-cp38-win_amd64.whl", hash = "sha256:3a11936adbc379a6061ea32fa03338d4ca7248d86dd507c81e13af428a5bc1db"}, - {file = "scikit_learn-1.3.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:998d38fcec96584deee1e79cd127469b3ad6fefd1ea6c2dfc54e8db367eb396b"}, - {file = "scikit_learn-1.3.0-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:ded35e810438a527e17623ac6deae3b360134345b7c598175ab7741720d7ffa7"}, - {file = "scikit_learn-1.3.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e8102d5036e28d08ab47166b48c8d5e5810704daecf3a476a4282d562be9a28"}, - {file = "scikit_learn-1.3.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7617164951c422747e7c32be4afa15d75ad8044f42e7d70d3e2e0429a50e6718"}, - {file = "scikit_learn-1.3.0-cp39-cp39-win_amd64.whl", hash = "sha256:1d54fb9e6038284548072df22fd34777e434153f7ffac72c8596f2d6987110dd"}, -] - -[package.dependencies] -joblib = ">=1.1.1" -numpy = ">=1.17.3" -scipy = ">=1.5.0" -threadpoolctl = ">=2.0.0" - -[package.extras] -benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] -docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", 
"sphinx-gallery (>=0.10.1)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] -examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] -tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.16.2)"] - -[[package]] -name = "scipy" -version = "1.9.3" -description = "Fundamental algorithms for scientific computing in Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "scipy-1.9.3-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1884b66a54887e21addf9c16fb588720a8309a57b2e258ae1c7986d4444d3bc0"}, - {file = "scipy-1.9.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:83b89e9586c62e787f5012e8475fbb12185bafb996a03257e9675cd73d3736dd"}, - {file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1a72d885fa44247f92743fc20732ae55564ff2a519e8302fb7e18717c5355a8b"}, - {file = "scipy-1.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d01e1dd7b15bd2449c8bfc6b7cc67d630700ed655654f0dfcf121600bad205c9"}, - {file = "scipy-1.9.3-cp310-cp310-win_amd64.whl", hash = "sha256:68239b6aa6f9c593da8be1509a05cb7f9efe98b80f43a5861cd24c7557e98523"}, - {file = "scipy-1.9.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:b41bc822679ad1c9a5f023bc93f6d0543129ca0f37c1ce294dd9d386f0a21096"}, - {file = "scipy-1.9.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:90453d2b93ea82a9f434e4e1cba043e779ff67b92f7a0e85d05d286a3625df3c"}, - {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:83c06e62a390a9167da60bedd4575a14c1f58ca9dfde59830fc42e5197283dab"}, - {file = "scipy-1.9.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abaf921531b5aeaafced90157db505e10345e45038c39e5d9b6c7922d68085cb"}, - {file = "scipy-1.9.3-cp311-cp311-win_amd64.whl", hash = "sha256:06d2e1b4c491dc7d8eacea139a1b0b295f74e1a1a0f704c375028f8320d16e31"}, - {file = "scipy-1.9.3-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:5a04cd7d0d3eff6ea4719371cbc44df31411862b9646db617c99718ff68d4840"}, - {file = "scipy-1.9.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:545c83ffb518094d8c9d83cce216c0c32f8c04aaf28b92cc8283eda0685162d5"}, - {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0d54222d7a3ba6022fdf5773931b5d7c56efe41ede7f7128c7b1637700409108"}, - {file = "scipy-1.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cff3a5295234037e39500d35316a4c5794739433528310e117b8a9a0c76d20fc"}, - {file = "scipy-1.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:2318bef588acc7a574f5bfdff9c172d0b1bf2c8143d9582e05f878e580a3781e"}, - {file = "scipy-1.9.3-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d644a64e174c16cb4b2e41dfea6af722053e83d066da7343f333a54dae9bc31c"}, - {file = "scipy-1.9.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:da8245491d73ed0a994ed9c2e380fd058ce2fa8a18da204681f2fe1f57f98f95"}, - {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4db5b30849606a95dcf519763dd3ab6fe9bd91df49eba517359e450a7d80ce2e"}, - {file = "scipy-1.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c68db6b290cbd4049012990d7fe71a2abd9ffbe82c0056ebe0f01df8be5436b0"}, - {file = 
"scipy-1.9.3-cp39-cp39-win_amd64.whl", hash = "sha256:5b88e6d91ad9d59478fafe92a7c757d00c59e3bdc3331be8ada76a4f8d683f58"}, - {file = "scipy-1.9.3.tar.gz", hash = "sha256:fbc5c05c85c1a02be77b1ff591087c83bc44579c6d2bd9fb798bb64ea5e1a027"}, -] - -[package.dependencies] -numpy = ">=1.18.5,<1.26.0" - -[package.extras] -dev = ["flake8", "mypy", "pycodestyle", "typing_extensions"] -doc = ["matplotlib (>2)", "numpydoc", "pydata-sphinx-theme (==0.9.0)", "sphinx (!=4.1.0)", "sphinx-panels (>=0.5.2)", "sphinx-tabs"] -test = ["asv", "gmpy2", "mpmath", "pytest", "pytest-cov", "pytest-xdist", "scikit-umfpack", "threadpoolctl"] - [[package]] name = "send2trash" version = "1.8.2" @@ -3629,82 +3192,6 @@ nativelib = ["pyobjc-framework-Cocoa", "pywin32"] objc = ["pyobjc-framework-Cocoa"] win32 = ["pywin32"] -[[package]] -name = "sentence-transformers" -version = "2.2.2" -description = "Multilingual text embeddings" -optional = true -python-versions = ">=3.6.0" -files = [ - {file = "sentence-transformers-2.2.2.tar.gz", hash = "sha256:dbc60163b27de21076c9a30d24b5b7b6fa05141d68cf2553fa9a77bf79a29136"}, -] - -[package.dependencies] -huggingface-hub = ">=0.4.0" -nltk = "*" -numpy = "*" -scikit-learn = "*" -scipy = "*" -sentencepiece = "*" -torch = ">=1.6.0" -torchvision = "*" -tqdm = "*" -transformers = ">=4.6.0,<5.0.0" - -[[package]] -name = "sentencepiece" -version = "0.1.99" -description = "SentencePiece python wrapper" -optional = true -python-versions = "*" -files = [ - {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0eb528e70571b7c02723e5804322469b82fe7ea418c96051d0286c0fa028db73"}, - {file = "sentencepiece-0.1.99-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:77d7fafb2c4e4659cbdf303929503f37a26eabc4ff31d3a79bf1c5a1b338caa7"}, - {file = "sentencepiece-0.1.99-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:be9cf5b9e404c245aeb3d3723c737ba7a8f5d4ba262ef233a431fa6c45f732a0"}, - {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baed1a26464998f9710d20e52607c29ffd4293e7c71c6a1f83f51ad0911ec12c"}, - {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9832f08bb372d4c8b567612f8eab9e36e268dff645f1c28f9f8e851be705f6d1"}, - {file = "sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:019e7535108e309dae2b253a75834fc3128240aa87c00eb80732078cdc182588"}, - {file = "sentencepiece-0.1.99-cp310-cp310-win32.whl", hash = "sha256:fa16a830416bb823fa2a52cbdd474d1f7f3bba527fd2304fb4b140dad31bb9bc"}, - {file = "sentencepiece-0.1.99-cp310-cp310-win_amd64.whl", hash = "sha256:14b0eccb7b641d4591c3e12ae44cab537d68352e4d3b6424944f0c447d2348d5"}, - {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6d3c56f24183a1e8bd61043ff2c58dfecdc68a5dd8955dc13bab83afd5f76b81"}, - {file = "sentencepiece-0.1.99-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ed6ea1819fd612c989999e44a51bf556d0ef6abfb553080b9be3d347e18bcfb7"}, - {file = "sentencepiece-0.1.99-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2a0260cd1fb7bd8b4d4f39dc2444a8d5fd4e0a0c4d5c899810ef1abf99b2d45"}, - {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8a1abff4d1ff81c77cac3cc6fefa34fa4b8b371e5ee51cb7e8d1ebc996d05983"}, - {file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:004e6a621d4bc88978eecb6ea7959264239a17b70f2cbc348033d8195c9808ec"}, - 
{file = "sentencepiece-0.1.99-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:db361e03342c41680afae5807590bc88aa0e17cfd1a42696a160e4005fcda03b"}, - {file = "sentencepiece-0.1.99-cp311-cp311-win32.whl", hash = "sha256:2d95e19168875b70df62916eb55428a0cbcb834ac51d5a7e664eda74def9e1e0"}, - {file = "sentencepiece-0.1.99-cp311-cp311-win_amd64.whl", hash = "sha256:f90d73a6f81248a909f55d8e6ef56fec32d559e1e9af045f0b0322637cb8e5c7"}, - {file = "sentencepiece-0.1.99-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:62e24c81e74bd87a6e0d63c51beb6527e4c0add67e1a17bac18bcd2076afcfeb"}, - {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:57efcc2d51caff20d9573567d9fd3f854d9efe613ed58a439c78c9f93101384a"}, - {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6a904c46197993bd1e95b93a6e373dca2f170379d64441041e2e628ad4afb16f"}, - {file = "sentencepiece-0.1.99-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d89adf59854741c0d465f0e1525b388c0d174f611cc04af54153c5c4f36088c4"}, - {file = "sentencepiece-0.1.99-cp36-cp36m-win32.whl", hash = "sha256:47c378146928690d1bc106fdf0da768cebd03b65dd8405aa3dd88f9c81e35dba"}, - {file = "sentencepiece-0.1.99-cp36-cp36m-win_amd64.whl", hash = "sha256:9ba142e7a90dd6d823c44f9870abdad45e6c63958eb60fe44cca6828d3b69da2"}, - {file = "sentencepiece-0.1.99-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b7b1a9ae4d7c6f1f867e63370cca25cc17b6f4886729595b885ee07a58d3cec3"}, - {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d0f644c9d4d35c096a538507b2163e6191512460035bf51358794a78515b74f7"}, - {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c8843d23a0f686d85e569bd6dcd0dd0e0cbc03731e63497ca6d5bacd18df8b85"}, - {file = "sentencepiece-0.1.99-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:33e6f690a1caebb4867a2e367afa1918ad35be257ecdb3455d2bbd787936f155"}, - {file = "sentencepiece-0.1.99-cp37-cp37m-win32.whl", hash = "sha256:8a321866c2f85da7beac74a824b4ad6ddc2a4c9bccd9382529506d48f744a12c"}, - {file = "sentencepiece-0.1.99-cp37-cp37m-win_amd64.whl", hash = "sha256:c42f753bcfb7661c122a15b20be7f684b61fc8592c89c870adf52382ea72262d"}, - {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:85b476406da69c70586f0bb682fcca4c9b40e5059814f2db92303ea4585c650c"}, - {file = "sentencepiece-0.1.99-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cfbcfe13c69d3f87b7fcd5da168df7290a6d006329be71f90ba4f56bc77f8561"}, - {file = "sentencepiece-0.1.99-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:445b0ec381af1cd4eef95243e7180c63d9c384443c16c4c47a28196bd1cda937"}, - {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c6890ea0f2b4703f62d0bf27932e35808b1f679bdb05c7eeb3812b935ba02001"}, - {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fb71af492b0eefbf9f2501bec97bcd043b6812ab000d119eaf4bd33f9e283d03"}, - {file = "sentencepiece-0.1.99-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:27b866b5bd3ddd54166bbcbf5c8d7dd2e0b397fac8537991c7f544220b1f67bc"}, - {file = "sentencepiece-0.1.99-cp38-cp38-win32.whl", hash = "sha256:b133e8a499eac49c581c3c76e9bdd08c338cc1939e441fee6f92c0ccb5f1f8be"}, - {file = "sentencepiece-0.1.99-cp38-cp38-win_amd64.whl", hash = 
"sha256:0eaf3591dd0690a87f44f4df129cf8d05d8a4029b5b6709b489b8e27f9a9bcff"}, - {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:38efeda9bbfb55052d482a009c6a37e52f42ebffcea9d3a98a61de7aee356a28"}, - {file = "sentencepiece-0.1.99-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c030b081dc1e1bcc9fadc314b19b740715d3d566ad73a482da20d7d46fd444c"}, - {file = "sentencepiece-0.1.99-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:84dbe53e02e4f8a2e45d2ac3e430d5c83182142658e25edd76539b7648928727"}, - {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0b0f55d0a0ee1719b4b04221fe0c9f0c3461dc3dabd77a035fa2f4788eb3ef9a"}, - {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18e800f206cd235dc27dc749299e05853a4e4332e8d3dfd81bf13d0e5b9007d9"}, - {file = "sentencepiece-0.1.99-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2ae1c40cda8f9d5b0423cfa98542735c0235e7597d79caf318855cdf971b2280"}, - {file = "sentencepiece-0.1.99-cp39-cp39-win32.whl", hash = "sha256:c84ce33af12ca222d14a1cdd37bd76a69401e32bc68fe61c67ef6b59402f4ab8"}, - {file = "sentencepiece-0.1.99-cp39-cp39-win_amd64.whl", hash = "sha256:350e5c74d739973f1c9643edb80f7cc904dc948578bcb1d43c6f2b173e5d18dd"}, - {file = "sentencepiece-0.1.99.tar.gz", hash = "sha256:189c48f5cb2949288f97ccdb97f0473098d9c3dcf5a3d99d4eabe719ec27297f"}, -] - [[package]] name = "setuptools" version = "67.8.0" @@ -3766,13 +3253,13 @@ files = [ [[package]] name = "soupsieve" -version = "2.5" +version = "2.4.1" description = "A modern CSS selector implementation for Beautiful Soup." optional = false -python-versions = ">=3.8" +python-versions = ">=3.7" files = [ - {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, - {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, + {file = "soupsieve-2.4.1-py3-none-any.whl", hash = "sha256:1c1bfee6819544a3447586c889157365a27e10d88cde3ad3da0cf0ddf646feb8"}, + {file = "soupsieve-2.4.1.tar.gz", hash = "sha256:89d12b2d5dfcd2c9e8c22326da9d9aa9cb3dfab0a83a024f05704076ee8d35ea"}, ] [[package]] @@ -4021,20 +3508,6 @@ pure-eval = "*" [package.extras] tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] -[[package]] -name = "sympy" -version = "1.12" -description = "Computer algebra system (CAS) in Python" -optional = true -python-versions = ">=3.8" -files = [ - {file = "sympy-1.12-py3-none-any.whl", hash = "sha256:c3588cd4295d0c0f603d0f2ae780587e64e2efeedb3521e46b9bb1d08d184fa5"}, - {file = "sympy-1.12.tar.gz", hash = "sha256:ebf595c8dac3e0fdc4152c51878b498396ec7f30e7a914d6071e674d49420fb8"}, -] - -[package.dependencies] -mpmath = ">=0.19" - [[package]] name = "tenacity" version = "8.2.3" @@ -4147,17 +3620,6 @@ mxnet = ["mxnet (>=1.5.1,<1.6.0)"] tensorflow = ["tensorflow (>=2.0.0,<2.6.0)"] torch = ["torch (>=1.6.0)"] -[[package]] -name = "threadpoolctl" -version = "3.2.0" -description = "threadpoolctl" -optional = true -python-versions = ">=3.8" -files = [ - {file = "threadpoolctl-3.2.0-py3-none-any.whl", hash = "sha256:2b7818516e423bdaebb97c723f86a7c6b0a83d3f3b0970328d66f4d9104dc032"}, - {file = "threadpoolctl-3.2.0.tar.gz", hash = "sha256:c96a0ba3bdddeaca37dc4cc7344aafad41cdb8c313f74fdfe387a867bba93355"}, -] - [[package]] name = "tinycss2" version = "1.2.1" @@ -4193,60 +3655,6 @@ idna = "*" requests = ">=2.1.0" requests-file = ">=1.4" 
-[[package]] -name = "tokenizers" -version = "0.13.3" -description = "Fast and Customizable Tokenizers" -optional = true -python-versions = "*" -files = [ - {file = "tokenizers-0.13.3-cp310-cp310-macosx_10_11_x86_64.whl", hash = "sha256:f3835c5be51de8c0a092058a4d4380cb9244fb34681fd0a295fbf0a52a5fdf33"}, - {file = "tokenizers-0.13.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4ef4c3e821730f2692489e926b184321e887f34fb8a6b80b8096b966ba663d07"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c5fd1a6a25353e9aa762e2aae5a1e63883cad9f4e997c447ec39d071020459bc"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ee0b1b311d65beab83d7a41c56a1e46ab732a9eed4460648e8eb0bd69fc2d059"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ef4215284df1277dadbcc5e17d4882bda19f770d02348e73523f7e7d8b8d396"}, - {file = "tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a4d53976079cff8a033f778fb9adca2d9d69d009c02fa2d71a878b5f3963ed30"}, - {file = "tokenizers-0.13.3-cp310-cp310-win32.whl", hash = "sha256:1f0e3b4c2ea2cd13238ce43548959c118069db7579e5d40ec270ad77da5833ce"}, - {file = "tokenizers-0.13.3-cp310-cp310-win_amd64.whl", hash = "sha256:89649c00d0d7211e8186f7a75dfa1db6996f65edce4b84821817eadcc2d3c79e"}, - {file = "tokenizers-0.13.3-cp311-cp311-macosx_10_11_universal2.whl", hash = "sha256:56b726e0d2bbc9243872b0144515ba684af5b8d8cd112fb83ee1365e26ec74c8"}, - {file = "tokenizers-0.13.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:cc5c022ce692e1f499d745af293ab9ee6f5d92538ed2faf73f9708c89ee59ce6"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f55c981ac44ba87c93e847c333e58c12abcbb377a0c2f2ef96e1a266e4184ff2"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f247eae99800ef821a91f47c5280e9e9afaeed9980fc444208d5aa6ba69ff148"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4b3e3215d048e94f40f1c95802e45dcc37c5b05eb46280fc2ccc8cd351bff839"}, - {file = "tokenizers-0.13.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9ba2b0bf01777c9b9bc94b53764d6684554ce98551fec496f71bc5be3a03e98b"}, - {file = "tokenizers-0.13.3-cp311-cp311-win32.whl", hash = "sha256:cc78d77f597d1c458bf0ea7c2a64b6aa06941c7a99cb135b5969b0278824d808"}, - {file = "tokenizers-0.13.3-cp311-cp311-win_amd64.whl", hash = "sha256:ecf182bf59bd541a8876deccf0360f5ae60496fd50b58510048020751cf1724c"}, - {file = "tokenizers-0.13.3-cp37-cp37m-macosx_10_11_x86_64.whl", hash = "sha256:0527dc5436a1f6bf2c0327da3145687d3bcfbeab91fed8458920093de3901b44"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:07cbb2c307627dc99b44b22ef05ff4473aa7c7cc1fec8f0a8b37d8a64b1a16d2"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4560dbdeaae5b7ee0d4e493027e3de6d53c991b5002d7ff95083c99e11dd5ac0"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:64064bd0322405c9374305ab9b4c07152a1474370327499911937fd4a76d004b"}, - {file = "tokenizers-0.13.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b8c6e2ab0f2e3d939ca66aa1d596602105fe33b505cd2854a4c1717f704c51de"}, - {file = 
"tokenizers-0.13.3-cp37-cp37m-win32.whl", hash = "sha256:6cc29d410768f960db8677221e497226e545eaaea01aa3613fa0fdf2cc96cff4"}, - {file = "tokenizers-0.13.3-cp37-cp37m-win_amd64.whl", hash = "sha256:fc2a7fdf864554a0dacf09d32e17c0caa9afe72baf9dd7ddedc61973bae352d8"}, - {file = "tokenizers-0.13.3-cp38-cp38-macosx_10_11_x86_64.whl", hash = "sha256:8791dedba834c1fc55e5f1521be325ea3dafb381964be20684b92fdac95d79b7"}, - {file = "tokenizers-0.13.3-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:d607a6a13718aeb20507bdf2b96162ead5145bbbfa26788d6b833f98b31b26e1"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3791338f809cd1bf8e4fee6b540b36822434d0c6c6bc47162448deee3f77d425"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c2f35f30e39e6aab8716f07790f646bdc6e4a853816cc49a95ef2a9016bf9ce6"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:310204dfed5aa797128b65d63538a9837cbdd15da2a29a77d67eefa489edda26"}, - {file = "tokenizers-0.13.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a0f9b92ea052305166559f38498b3b0cae159caea712646648aaa272f7160963"}, - {file = "tokenizers-0.13.3-cp38-cp38-win32.whl", hash = "sha256:9a3fa134896c3c1f0da6e762d15141fbff30d094067c8f1157b9fdca593b5806"}, - {file = "tokenizers-0.13.3-cp38-cp38-win_amd64.whl", hash = "sha256:8e7b0cdeace87fa9e760e6a605e0ae8fc14b7d72e9fc19c578116f7287bb873d"}, - {file = "tokenizers-0.13.3-cp39-cp39-macosx_10_11_x86_64.whl", hash = "sha256:00cee1e0859d55507e693a48fa4aef07060c4bb6bd93d80120e18fea9371c66d"}, - {file = "tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:a23ff602d0797cea1d0506ce69b27523b07e70f6dda982ab8cf82402de839088"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:70ce07445050b537d2696022dafb115307abdffd2a5c106f029490f84501ef97"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:280ffe95f50eaaf655b3a1dc7ff1d9cf4777029dbbc3e63a74e65a056594abc3"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:97acfcec592f7e9de8cadcdcda50a7134423ac8455c0166b28c9ff04d227b371"}, - {file = "tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dd7730c98a3010cd4f523465867ff95cd9d6430db46676ce79358f65ae39797b"}, - {file = "tokenizers-0.13.3-cp39-cp39-win32.whl", hash = "sha256:48625a108029cb1ddf42e17a81b5a3230ba6888a70c9dc14e81bc319e812652d"}, - {file = "tokenizers-0.13.3-cp39-cp39-win_amd64.whl", hash = "sha256:bc0a6f1ba036e482db6453571c9e3e60ecd5489980ffd95d11dc9f960483d783"}, - {file = "tokenizers-0.13.3.tar.gz", hash = "sha256:2e546dbb68b623008a5442353137fbb0123d311a6d7ba52f2667c8862a75af2e"}, -] - -[package.extras] -dev = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] -docs = ["setuptools-rust", "sphinx", "sphinx-rtd-theme"] -testing = ["black (==22.3)", "datasets", "numpy", "pytest", "requests"] - [[package]] name = "tomli" version = "2.0.1" @@ -4258,83 +3666,6 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] -[[package]] -name = "torch" -version = "2.0.1" -description = "Tensors and Dynamic neural networks in Python with strong GPU acceleration" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = 
"torch-2.0.1-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:8ced00b3ba471856b993822508f77c98f48a458623596a4c43136158781e306a"}, - {file = "torch-2.0.1-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:359bfaad94d1cda02ab775dc1cc386d585712329bb47b8741607ef6ef4950747"}, - {file = "torch-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:7c84e44d9002182edd859f3400deaa7410f5ec948a519cc7ef512c2f9b34d2c4"}, - {file = "torch-2.0.1-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:567f84d657edc5582d716900543e6e62353dbe275e61cdc36eda4929e46df9e7"}, - {file = "torch-2.0.1-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:787b5a78aa7917465e9b96399b883920c88a08f4eb63b5a5d2d1a16e27d2f89b"}, - {file = "torch-2.0.1-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e617b1d0abaf6ced02dbb9486803abfef0d581609b09641b34fa315c9c40766d"}, - {file = "torch-2.0.1-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b6019b1de4978e96daa21d6a3ebb41e88a0b474898fe251fd96189587408873e"}, - {file = "torch-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:dbd68cbd1cd9da32fe5d294dd3411509b3d841baecb780b38b3b7b06c7754434"}, - {file = "torch-2.0.1-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:ef654427d91600129864644e35deea761fb1fe131710180b952a6f2e2207075e"}, - {file = "torch-2.0.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:25aa43ca80dcdf32f13da04c503ec7afdf8e77e3a0183dd85cd3e53b2842e527"}, - {file = "torch-2.0.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:5ef3ea3d25441d3957348f7e99c7824d33798258a2bf5f0f0277cbcadad2e20d"}, - {file = "torch-2.0.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:0882243755ff28895e8e6dc6bc26ebcf5aa0911ed81b2a12f241fc4b09075b13"}, - {file = "torch-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:f66aa6b9580a22b04d0af54fcd042f52406a8479e2b6a550e3d9f95963e168c8"}, - {file = "torch-2.0.1-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:1adb60d369f2650cac8e9a95b1d5758e25d526a34808f7448d0bd599e4ae9072"}, - {file = "torch-2.0.1-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:1bcffc16b89e296826b33b98db5166f990e3b72654a2b90673e817b16c50e32b"}, - {file = "torch-2.0.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:e10e1597f2175365285db1b24019eb6f04d53dcd626c735fc502f1e8b6be9875"}, - {file = "torch-2.0.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:423e0ae257b756bb45a4b49072046772d1ad0c592265c5080070e0767da4e490"}, - {file = "torch-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:8742bdc62946c93f75ff92da00e3803216c6cce9b132fbca69664ca38cfb3e18"}, - {file = "torch-2.0.1-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:c62df99352bd6ee5a5a8d1832452110435d178b5164de450831a3a8cc14dc680"}, - {file = "torch-2.0.1-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:671a2565e3f63b8fe8e42ae3e36ad249fe5e567435ea27b94edaa672a7d0c416"}, -] - -[package.dependencies] -filelock = "*" -jinja2 = "*" -networkx = "*" -sympy = "*" -typing-extensions = "*" - -[package.extras] -opt-einsum = ["opt-einsum (>=3.3)"] - -[[package]] -name = "torchvision" -version = "0.15.2" -description = "image and video datasets and models for torch deep learning" -optional = true -python-versions = ">=3.8" -files = [ - {file = "torchvision-0.15.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7754088774e810c5672b142a45dcf20b1bd986a5a7da90f8660c43dc43fb850c"}, - {file = "torchvision-0.15.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37eb138e13f6212537a3009ac218695483a635c404b6cc1d8e0d0d978026a86d"}, - {file = "torchvision-0.15.2-cp310-cp310-manylinux1_x86_64.whl", hash = 
"sha256:54143f7cc0797d199b98a53b7d21c3f97615762d4dd17ad45a41c7e80d880e73"}, - {file = "torchvision-0.15.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:1eefebf5fbd01a95fe8f003d623d941601c94b5cec547b420da89cb369d9cf96"}, - {file = "torchvision-0.15.2-cp310-cp310-win_amd64.whl", hash = "sha256:96fae30c5ca8423f4b9790df0f0d929748e32718d88709b7b567d2f630c042e3"}, - {file = "torchvision-0.15.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5f35f6bd5bcc4568e6522e4137fa60fcc72f4fa3e615321c26cd87e855acd398"}, - {file = "torchvision-0.15.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:757505a0ab2be7096cb9d2bf4723202c971cceddb72c7952a7e877f773de0f8a"}, - {file = "torchvision-0.15.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:012ad25cfd9019ff9b0714a168727e3845029be1af82296ff1e1482931fa4b80"}, - {file = "torchvision-0.15.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:b02a7ffeaa61448737f39a4210b8ee60234bda0515a0c0d8562f884454105b0f"}, - {file = "torchvision-0.15.2-cp311-cp311-win_amd64.whl", hash = "sha256:10be76ceded48329d0a0355ac33da131ee3993ff6c125e4a02ab34b5baa2472c"}, - {file = "torchvision-0.15.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8f12415b686dba884fb086f53ac803f692be5a5cdd8a758f50812b30fffea2e4"}, - {file = "torchvision-0.15.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:31211c01f8b8ec33b8a638327b5463212e79a03e43c895f88049f97af1bd12fd"}, - {file = "torchvision-0.15.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:c55f9889e436f14b4f84a9c00ebad0d31f5b4626f10cf8018e6c676f92a6d199"}, - {file = "torchvision-0.15.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:9a192f2aa979438f23c20e883980b23d13268ab9f819498774a6d2eb021802c2"}, - {file = "torchvision-0.15.2-cp38-cp38-win_amd64.whl", hash = "sha256:c07071bc8d02aa8fcdfe139ab6a1ef57d3b64c9e30e84d12d45c9f4d89fb6536"}, - {file = "torchvision-0.15.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4790260fcf478a41c7ecc60a6d5200a88159fdd8d756e9f29f0f8c59c4a67a68"}, - {file = "torchvision-0.15.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:987ab62225b4151a11e53fd06150c5258ced24ac9d7c547e0e4ab6fbca92a5ce"}, - {file = "torchvision-0.15.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:63df26673e66cba3f17e07c327a8cafa3cce98265dbc3da329f1951d45966838"}, - {file = "torchvision-0.15.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:b85f98d4cc2f72452f6792ab4463a3541bc5678a8cdd3da0e139ba2fe8b56d42"}, - {file = "torchvision-0.15.2-cp39-cp39-win_amd64.whl", hash = "sha256:07c462524cc1bba5190c16a9d47eac1fca024d60595a310f23c00b4ffff18b30"}, -] - -[package.dependencies] -numpy = "*" -pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" -requests = "*" -torch = "2.0.1" - -[package.extras] -scipy = ["scipy"] - [[package]] name = "tornado" version = "6.3.3" @@ -4390,75 +3721,6 @@ files = [ docs = ["myst-parser", "pydata-sphinx-theme", "sphinx"] test = ["argcomplete (>=2.0)", "pre-commit", "pytest", "pytest-mock"] -[[package]] -name = "transformers" -version = "4.33.1" -description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" -optional = true -python-versions = ">=3.8.0" -files = [ - {file = "transformers-4.33.1-py3-none-any.whl", hash = "sha256:0630c2d26448d7c6cb78435e6c43910c89e99387badea6be1f565ffa3f093f1d"}, - {file = "transformers-4.33.1.tar.gz", hash = "sha256:744265e9f0724d22c229938f28376af54abce730ef647f35bd1685abf49912a4"}, -] - -[package.dependencies] -filelock = "*" -huggingface-hub = ">=0.15.1,<1.0" -numpy = ">=1.17" -packaging = ">=20.0" -pyyaml = ">=5.1" -regex = 
"!=2019.12.17" -requests = "*" -safetensors = ">=0.3.1" -tokenizers = ">=0.11.1,<0.11.3 || >0.11.3,<0.14" -tqdm = ">=4.27" - -[package.extras] -accelerate = ["accelerate (>=0.20.3)"] -agents = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.10,!=1.12.0)"] -all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] -audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -codecarbon = ["codecarbon (==1.2.0)"] -deepspeed = ["accelerate (>=0.20.3)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", 
"sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "timeout-decorator", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] -docs-specific = ["hf-doc-builder"] -fairscale = ["fairscale (>0.3)"] -flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"] -flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -ftfy = ["ftfy"] -integrations = ["optuna", "ray[tune]", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] -modelcreation = ["cookiecutter (==1.7.3)"] -natten = ["natten (>=0.14.6)"] -onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] -onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] -optuna = ["optuna"] -quality = ["GitPython (<3.1.19)", "black (>=23.1,<24.0)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (>=0.0.241,<=0.0.259)", "urllib3 (<2.0.0)"] -ray = ["ray[tune]"] -retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] -sagemaker = ["sagemaker (>=2.31.0)"] -sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] -serving = ["fastapi", "pydantic (<2)", "starlette", "uvicorn"] -sigopt = ["sigopt"] -sklearn = ["scikit-learn"] -speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score 
(!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "timeout-decorator"] -tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx"] -tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx"] -tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] -timm = ["timm"] -tokenizers = ["tokenizers (>=0.11.1,!=0.11.3,<0.14)"] -torch = ["accelerate (>=0.20.3)", "torch (>=1.10,!=1.12.0)"] -torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -torch-vision = ["Pillow (<10.0.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.15.1,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.11.1,!=0.11.3,<0.14)", "torch (>=1.10,!=1.12.0)", "tqdm (>=4.27)"] -video = ["av (==9.2.0)", "decord (==0.6.0)"] -vision = ["Pillow (<10.0.0)"] - [[package]] name = "typer" version = "0.9.0" @@ -4491,6 +3753,31 @@ files = [ {file = "types_PyYAML-6.0.12.11-py3-none-any.whl", hash = "sha256:a461508f3096d1d5810ec5ab95d7eeecb651f3a15b71959999988942063bf01d"}, ] +[[package]] +name = "types-requests" +version = "2.31.0.2" +description = "Typing stubs for requests" +optional = false +python-versions = "*" +files = [ + {file = "types-requests-2.31.0.2.tar.gz", hash = "sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40"}, + {file = "types_requests-2.31.0.2-py3-none-any.whl", hash = "sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a"}, +] + +[package.dependencies] +types-urllib3 = "*" + +[[package]] +name = "types-urllib3" +version = "1.26.25.14" +description = "Typing stubs for urllib3" +optional = false +python-versions = "*" +files = [ + {file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"}, + {file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"}, +] + [[package]] name = "typing-extensions" version = "4.7.1" @@ -4517,17 +3804,6 @@ files = [ mypy-extensions = ">=0.3.0" typing-extensions = ">=3.7.4" -[[package]] -name = "tzdata" -version = "2023.3" -description = "Provider of IANA time zone data" -optional = true -python-versions = ">=2" -files = [ - {file = "tzdata-2023.3-py2.py3-none-any.whl", hash = "sha256:7e65763eef3120314099b6939b5546db7adce1e7d6f2e179e3df563c70511eda"}, - {file = "tzdata-2023.3.tar.gz", hash = "sha256:11ef1e08e54acb0d4f95bdb1be05da659673de4acbd21bf9c69e94cc5e907a3a"}, -] - [[package]] name = "uri-template" version = "1.3.0" @@ -4559,33 +3835,6 @@ secure = ["certifi", "cryptography (>=1.9)", "idna (>=2.0.0)", "pyopenssl (>=17. 
socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] -[[package]] -name = "vowpal-wabbit-next" -version = "0.6.0" -description = "Experimental python bindings for VowpalWabbit" -optional = true -python-versions = ">=3.7" -files = [ - {file = "vowpal-wabbit-next-0.6.0.tar.gz", hash = "sha256:f0381614d99fac6a0f52e995ee0bfc7b681054f397bea7ff08b8a523d5315a54"}, - {file = "vowpal_wabbit_next-0.6.0-cp310-cp310-macosx_10_13_universal2.whl", hash = "sha256:cfbb831cfe9eb81185aff7cdca437ae17c6d9aca8d74e26c326e3ef4ee8e81e7"}, - {file = "vowpal_wabbit_next-0.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9d31829778f9c600f5c121f614516ca1bc9ede5d1bc77b1eb3b59b32d9138db9"}, - {file = "vowpal_wabbit_next-0.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:714347606ab302a2f72870b6ae6dce58de4bec1b489f4bd65d80a8e326e1db8a"}, - {file = "vowpal_wabbit_next-0.6.0-cp311-cp311-macosx_10_13_universal2.whl", hash = "sha256:3a8482d5c0b9357fdb36b62d659e6b74e93aeab165b910292572a98e91d7a014"}, - {file = "vowpal_wabbit_next-0.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1e4349099b938102f51fb6fedf035bc1deacb2971cd2a48641ca7d45186efda0"}, - {file = "vowpal_wabbit_next-0.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:c8f58cdc49f270b1bed6f0fdd7520c8ba1b328de5cd8a2760c0ec70a630de92e"}, - {file = "vowpal_wabbit_next-0.6.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8b7052ce7212fd1cae8ffd966e240c814f3c1df08fd612437d48f0f23e7694c"}, - {file = "vowpal_wabbit_next-0.6.0-cp37-cp37m-win_amd64.whl", hash = "sha256:d24d9c380d0e9b41151337c7f9e2a33ec5bfd738fdee9f65c1a40e486234aca3"}, - {file = "vowpal_wabbit_next-0.6.0-cp38-cp38-macosx_10_13_universal2.whl", hash = "sha256:0d77a8c55249ec9a7f404939ecc6948db0527e522e8a7ae149ec7cd29b3ade04"}, - {file = "vowpal_wabbit_next-0.6.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa2f52f1267fbc26c7757335f9c76a0f00b112971e04c85b8a9bc9e82300597"}, - {file = "vowpal_wabbit_next-0.6.0-cp38-cp38-win_amd64.whl", hash = "sha256:5d04f91200ecae73196d9f5601853d63afce8c1c8a0d310a608e8ddfa3b190cb"}, - {file = "vowpal_wabbit_next-0.6.0-cp39-cp39-macosx_10_13_universal2.whl", hash = "sha256:2df4a652729c0db34afd8fb4fc49b0090d6f061e2d49899e5f092fd4c3d23253"}, - {file = "vowpal_wabbit_next-0.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c289a260ab759f04903b441701cff66ea74d6c061d966caaba0c65ac12d05528"}, - {file = "vowpal_wabbit_next-0.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:8d022cab07274f227df159a81bccf034def7dd54ad70392ee98743ffa4953072"}, -] - -[package.dependencies] -numpy = "*" - [[package]] name = "wasabi" version = "1.1.2" @@ -4639,13 +3888,13 @@ files = [ [[package]] name = "websocket-client" -version = "1.6.3" +version = "1.6.2" description = "WebSocket client for Python with low level API options" optional = false python-versions = ">=3.8" files = [ - {file = "websocket-client-1.6.3.tar.gz", hash = "sha256:3aad25d31284266bcfcfd1fd8a743f63282305a364b8d0948a43bd606acc652f"}, - {file = "websocket_client-1.6.3-py3-none-any.whl", hash = "sha256:6cfc30d051ebabb73a5fa246efdcc14c8fbebbd0330f8984ac3bb6d9edd2ad03"}, + {file = "websocket-client-1.6.2.tar.gz", hash = "sha256:53e95c826bf800c4c465f50093a8c4ff091c7327023b10bfaff40cf1ef170eaa"}, + {file = "websocket_client-1.6.2-py3-none-any.whl", hash = "sha256:ce54f419dfae71f4bdba69ebe65bf7f0a93fe71bc009ad3a010aacc3eebad537"}, ] [package.extras] @@ -4767,9 +4016,9 @@ docs = ["furo", 
"jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -extended-testing = ["faker", "presidio-analyzer", "presidio-anonymizer", "sentence-transformers", "vowpal-wabbit-next"] +extended-testing = ["faker", "presidio-analyzer", "presidio-anonymizer"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "a392728e7880f0fc679885888dbc69838f6de94607803fec40b4640ae63d02d8" +content-hash = "443e88f690572715cf58671e4480a006574c7141a1258dff0a0818b954184901" diff --git a/libs/experimental/pyproject.toml b/libs/experimental/pyproject.toml index 0b7124e460..e0fed35bc9 100644 --- a/libs/experimental/pyproject.toml +++ b/libs/experimental/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain-experimental" -version = "0.0.14" +version = "0.0.16" description = "Building applications with LLMs through composability" authors = [] license = "MIT" @@ -26,6 +26,7 @@ black = "^23.1.0" [tool.poetry.group.typing.dependencies] mypy = "^0.991" types-pyyaml = "^6.0.12.2" +types-requests = "^2.28.11.5" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0" diff --git a/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py new file mode 100644 index 0000000000..9484a0e9dc --- /dev/null +++ b/libs/experimental/tests/unit_tests/test_reversible_data_anonymizer.py @@ -0,0 +1,154 @@ +import os +from typing import Iterator, List + +import pytest + + +@pytest.fixture(scope="module", autouse=True) +def check_spacy_model() -> Iterator[None]: + import spacy + + if not spacy.util.is_package("en_core_web_lg"): + pytest.skip(reason="Spacy model 'en_core_web_lg' not installed") + yield + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +@pytest.mark.parametrize( + "analyzed_fields,should_contain", + [(["PERSON"], False), (["PHONE_NUMBER"], True), (None, False)], +) +def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None: + """Test anonymizing a name in a simple sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = "Hello, my name is John Doe." 
+ anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields) + anonymized_text = anonymizer.anonymize(text) + assert ("John Doe" in anonymized_text) == should_contain + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_anonymize_multiple() -> None: + """Test anonymizing multiple items in a sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = "John Smith's phone number is 313-666-7440 and email is johnsmith@gmail.com" + anonymizer = PresidioReversibleAnonymizer() + anonymized_text = anonymizer.anonymize(text) + for phrase in ["John Smith", "313-666-7440", "johnsmith@gmail.com"]: + assert phrase not in anonymized_text + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_anonymize_with_custom_operator() -> None: + """Test anonymize a name with a custom operator""" + from presidio_anonymizer.entities import OperatorConfig + + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": ""})} + anonymizer = PresidioReversibleAnonymizer(operators=custom_operator) + + text = "Jane Doe was here." + + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == " was here." + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_add_recognizer_operator() -> None: + """ + Test add recognizer and anonymize a new type of entity and with a custom operator + """ + from presidio_analyzer import PatternRecognizer + from presidio_anonymizer.entities import OperatorConfig + + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[]) + titles_list = ["Sir", "Madam", "Professor"] + custom_recognizer = PatternRecognizer( + supported_entity="TITLE", deny_list=titles_list + ) + anonymizer.add_recognizer(custom_recognizer) + + # anonymizing with custom recognizer + text = "Madam Jane Doe was here." + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == " Jane Doe was here." + + # anonymizing with custom recognizer and operator + custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})} + anonymizer.add_operators(custom_operator) + anonymized_text = anonymizer.anonymize(text) + assert anonymized_text == "Dear Jane Doe was here." + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_deanonymizer_mapping() -> None: + """Test if deanonymizer mapping is correctly populated""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + anonymizer = PresidioReversibleAnonymizer( + analyzed_fields=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"] + ) + + anonymizer.anonymize("Hello, my name is John Doe and my number is 444 555 6666.") + + # ["PERSON", "PHONE_NUMBER"] + assert len(anonymizer.deanonymizer_mapping.keys()) == 2 + assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values() + assert ( + "444 555 6666" + in anonymizer.deanonymizer_mapping.get("PHONE_NUMBER", {}).values() + ) + + text_to_anonymize = ( + "And my name is Jane Doe, my email is jane@gmail.com and " + "my credit card is 4929 5319 6292 5362." 
+ ) + anonymizer.anonymize(text_to_anonymize) + + # ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"] + assert len(anonymizer.deanonymizer_mapping.keys()) == 4 + assert "Jane Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values() + assert ( + "jane@gmail.com" + in anonymizer.deanonymizer_mapping.get("EMAIL_ADDRESS", {}).values() + ) + assert ( + "4929 5319 6292 5362" + in anonymizer.deanonymizer_mapping.get("CREDIT_CARD", {}).values() + ) + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_deanonymize() -> None: + """Test deanonymizing a name in a simple sentence""" + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + text = "Hello, my name is John Doe." + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) + anonymized_text = anonymizer.anonymize(text) + deanonymized_text = anonymizer.deanonymize(anonymized_text) + assert deanonymized_text == text + + +@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker") +def test_save_load_deanonymizer_mapping() -> None: + from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer + + anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"]) + anonymizer.anonymize("Hello, my name is John Doe.") + try: + anonymizer.save_deanonymizer_mapping("test_file.json") + assert os.path.isfile("test_file.json") + + anonymizer = PresidioReversibleAnonymizer() + anonymizer.load_deanonymizer_mapping("test_file.json") + + assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values() + + finally: + os.remove("test_file.json") diff --git a/libs/langchain/langchain/agents/agent.py b/libs/langchain/langchain/agents/agent.py index bc266d7c1e..2912cc57fd 100644 --- a/libs/langchain/langchain/agents/agent.py +++ b/libs/langchain/langchain/agents/agent.py @@ -7,7 +7,16 @@ import logging import time from abc import abstractmethod from pathlib import Path -from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Union, +) import yaml @@ -36,6 +45,7 @@ from langchain.schema import ( ) from langchain.schema.language_model import BaseLanguageModel from langchain.schema.messages import BaseMessage +from langchain.schema.runnable import Runnable from langchain.tools.base import BaseTool from langchain.utilities.asyncio import asyncio_timeout from langchain.utils.input import get_color_mapping @@ -307,6 +317,71 @@ class AgentOutputParser(BaseOutputParser): """Parse text into agent action/finish.""" +class RunnableAgent(BaseSingleActionAgent): + """Agent powered by runnables.""" + + runnable: Runnable[dict, Union[AgentAction, AgentFinish]] + """Runnable to call to get agent action.""" + _input_keys: List[str] = [] + """Input keys.""" + + class Config: + """Configuration for this pydantic object.""" + + arbitrary_types_allowed = True + + @property + def input_keys(self) -> List[str]: + """Return the input keys. + + Returns: + List of input keys. + """ + return self._input_keys + + def plan( + self, + intermediate_steps: List[Tuple[AgentAction, str]], + callbacks: Callbacks = None, + **kwargs: Any, + ) -> Union[AgentAction, AgentFinish]: + """Given input, decide what to do. + + Args: + intermediate_steps: Steps the LLM has taken to date, + along with the observations. + callbacks: Callbacks to run. + **kwargs: User inputs. + + Returns: + Action specifying what tool to use.
+ """ + inputs = {**kwargs, **{"intermediate_steps": intermediate_steps}} + output = self.runnable.invoke(inputs, config={"callbacks": callbacks}) + return output + + async def aplan( + self, + intermediate_steps: List[Tuple[AgentAction, str]], + callbacks: Callbacks = None, + **kwargs: Any, + ) -> Union[AgentAction, AgentFinish]: + """Given input, decided what to do. + + Args: + intermediate_steps: Steps the LLM has taken to date, + along with observations + callbacks: Callbacks to run. + **kwargs: User inputs. + + Returns: + Action specifying what tool to use. + """ + inputs = {**kwargs, **{"intermediate_steps": intermediate_steps}} + output = await self.runnable.ainvoke(inputs, config={"callbacks": callbacks}) + return output + + class LLMSingleActionAgent(BaseSingleActionAgent): """Base class for single action agents.""" @@ -725,6 +800,14 @@ s ) return values + @root_validator(pre=True) + def validate_runnable_agent(cls, values: Dict) -> Dict: + """Convert runnable to agent if passed in.""" + agent = values["agent"] + if isinstance(agent, Runnable): + values["agent"] = RunnableAgent(runnable=agent) + return values + def save(self, file_path: Union[Path, str]) -> None: """Raise error - saving not supported for Agent Executors.""" raise ValueError( diff --git a/libs/langchain/langchain/callbacks/confident_callback.py b/libs/langchain/langchain/callbacks/confident_callback.py new file mode 100644 index 0000000000..d65ad8a0a2 --- /dev/null +++ b/libs/langchain/langchain/callbacks/confident_callback.py @@ -0,0 +1,188 @@ +# flake8: noqa +import os +import warnings +from typing import Any, Dict, List, Optional, Union + +from langchain.callbacks.base import BaseCallbackHandler +from langchain.schema import AgentAction, AgentFinish, LLMResult + + +class DeepEvalCallbackHandler(BaseCallbackHandler): + """Callback Handler that logs into deepeval. + + Args: + implementation_name: name of the `implementation` in deepeval + metrics: A list of metrics + + Raises: + ImportError: if the `deepeval` package is not installed. + + Examples: + >>> from langchain.llms import OpenAI + >>> from langchain.callbacks import DeepEvalCallbackHandler + >>> from deepeval.metrics import AnswerRelevancy + >>> metric = AnswerRelevancy(minimum_score=0.3) + >>> deepeval_callback = DeepEvalCallbackHandler( + ... implementation_name="exampleImplementation", + ... metrics=[metric], + ... ) + >>> llm = OpenAI( + ... temperature=0, + ... callbacks=[deepeval_callback], + ... verbose=True, + ... openai_api_key="API_KEY_HERE", + ... ) + >>> llm.generate([ + ... "What is the best evaluation tool out there? (no bias at all)", + ... ]) + "Deepeval, no doubt about it." + """ + + REPO_URL: str = "https://github.com/confident-ai/deepeval" + ISSUES_URL: str = f"{REPO_URL}/issues" + BLOG_URL: str = "https://docs.confident-ai.com" # noqa: E501 + + def __init__( + self, + metrics: List[Any], + implementation_name: Optional[str] = None, + ) -> None: + """Initializes the `deepevalCallbackHandler`. + + Args: + implementation_name: Name of the implementation you want. + metrics: What metrics do you want to track? + + Raises: + ImportError: if the `deepeval` package is not installed. + ConnectionError: if the connection to deepeval fails. + """ + + super().__init__() + + # Import deepeval (not via `import_deepeval` to keep hints in IDEs) + try: + import deepeval # ignore: F401,I001 + except ImportError: + raise ImportError( + """To use the deepeval callback manager you need to have the + `deepeval` Python package installed. 
Please install it with + `pip install deepeval`""" + ) + + if os.path.exists(".deepeval"): + warnings.warn( + """You are currently not logging anything to the dashboard; we + recommend using `deepeval login`.""" + ) + + # Set the deepeval variables + self.implementation_name = implementation_name + self.metrics = metrics + + warnings.warn( + ( + "The `DeepEvalCallbackHandler` is currently in beta and is subject to" + " change based on updates to `langchain`. Please report any issues to" + f" {self.ISSUES_URL} as an `integration` issue." + ), + ) + + def on_llm_start( + self, serialized: Dict[str, Any], prompts: List[str], **kwargs: Any + ) -> None: + """Store the prompts""" + self.prompts = prompts + + def on_llm_new_token(self, token: str, **kwargs: Any) -> None: + """Do nothing when a new token is generated.""" + pass + + def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None: + """Log records to deepeval when an LLM ends.""" + from deepeval.metrics.answer_relevancy import AnswerRelevancy + from deepeval.metrics.bias_classifier import UnBiasedMetric + from deepeval.metrics.metric import Metric + from deepeval.metrics.toxic_classifier import NonToxicMetric + + for metric in self.metrics: + for i, generation in enumerate(response.generations): + # Here, we only measure the first generation's output + output = generation[0].text + query = self.prompts[i] + if isinstance(metric, AnswerRelevancy): + result = metric.measure( + output=output, + query=query, + ) + print(f"Answer Relevancy: {result}") + elif isinstance(metric, UnBiasedMetric): + score = metric.measure(output) + print(f"Bias Score: {score}") + elif isinstance(metric, NonToxicMetric): + score = metric.measure(output) + print(f"Toxic Score: {score}") + else: + raise ValueError( + f"""Metric {metric.__class__.__name__} is not supported by deepeval + callbacks.""" + ) + + def on_llm_error( + self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any + ) -> None: + """Do nothing when LLM outputs an error.""" + pass + + def on_chain_start( + self, serialized: Dict[str, Any], inputs: Dict[str, Any], **kwargs: Any + ) -> None: + """Do nothing when chain starts""" + pass + + def on_chain_end(self, outputs: Dict[str, Any], **kwargs: Any) -> None: + """Do nothing when chain ends.""" + pass + + def on_chain_error( + self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any + ) -> None: + """Do nothing when LLM chain outputs an error.""" + pass + + def on_tool_start( + self, + serialized: Dict[str, Any], + input_str: str, + **kwargs: Any, + ) -> None: + """Do nothing when tool starts.""" + pass + + def on_agent_action(self, action: AgentAction, **kwargs: Any) -> Any: + """Do nothing when agent takes a specific action.""" + pass + + def on_tool_end( + self, + output: str, + observation_prefix: Optional[str] = None, + llm_prefix: Optional[str] = None, + **kwargs: Any, + ) -> None: + """Do nothing when tool ends.""" + pass + + def on_tool_error( + self, error: Union[Exception, KeyboardInterrupt], **kwargs: Any + ) -> None: + """Do nothing when tool outputs an error.""" + pass + + def on_text(self, text: str, **kwargs: Any) -> None: + """Do nothing""" + pass + + def on_agent_finish(self, finish: AgentFinish, **kwargs: Any) -> None: + """Do nothing""" + pass diff --git a/libs/langchain/langchain/callbacks/llmonitor_callback.py b/libs/langchain/langchain/callbacks/llmonitor_callback.py index 140ccaed40..9aadc9df71 100644 --- a/libs/langchain/langchain/callbacks/llmonitor_callback.py +++
b/libs/langchain/langchain/callbacks/llmonitor_callback.py @@ -14,6 +14,70 @@ from langchain.schema.output import LLMResult DEFAULT_API_URL = "https://app.llmonitor.com" +def _serialize(obj: Any) -> Union[Dict[str, Any], List[Any], Any]: + if hasattr(obj, "to_json"): + return obj.to_json() + + if isinstance(obj, dict): + return {key: _serialize(value) for key, value in obj.items()} + + if isinstance(obj, list): + return [_serialize(element) for element in obj] + + return obj + + +def _parse_input(raw_input: Any) -> Any: + if not raw_input: + return None + + if not isinstance(raw_input, dict): + return _serialize(raw_input) + + input_value = raw_input.get("input") + inputs_value = raw_input.get("inputs") + question_value = raw_input.get("question") + query_value = raw_input.get("query") + + if input_value: + return input_value + if inputs_value: + return inputs_value + if question_value: + return question_value + if query_value: + return query_value + + return _serialize(raw_input) + + +def _parse_output(raw_output: dict) -> Any: + if not raw_output: + return None + + if not isinstance(raw_output, dict): + return _serialize(raw_output) + + text_value = raw_output.get("text") + output_value = raw_output.get("output") + output_text_value = raw_output.get("output_text") + answer_value = raw_output.get("answer") + result_value = raw_output.get("result") + + if text_value: + return text_value + if answer_value: + return answer_value + if output_value: + return output_value + if output_text_value: + return output_text_value + if result_value: + return result_value + + return _serialize(raw_output) + + def _parse_lc_role( role: str, ) -> Union[Literal["user", "ai", "system", "function"], None]: @@ -29,8 +93,27 @@ def _parse_lc_role( return None -def _serialize_lc_message(message: BaseMessage) -> Dict[str, Any]: - return {"text": message.content, "role": _parse_lc_role(message.type)} +def _get_user_id(metadata: Any) -> Any: + metadata = metadata or {} + user_id = metadata.get("user_id") + if user_id is None: + user_id = metadata.get("userId") + return user_id + + +def _parse_lc_message(message: BaseMessage) -> Dict[str, Any]: + parsed = {"text": message.content, "role": _parse_lc_role(message.type)} + + function_call = (message.additional_kwargs or {}).get("function_call") + + if function_call is not None: + parsed["functionCall"] = function_call + + return parsed + + +def _parse_lc_messages(messages: Union[List[BaseMessage], Any]) -> List[Dict[str, Any]]: + return [_parse_lc_message(message) for message in messages] class LLMonitorCallbackHandler(BaseCallbackHandler): @@ -62,14 +145,20 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): __api_url: str __app_id: str + __verbose: bool def __init__( - self, app_id: Union[str, None] = None, api_url: Union[str, None] = None + self, + app_id: Union[str, None] = None, + api_url: Union[str, None] = None, + verbose: bool = False, ) -> None: super().__init__() self.__api_url = api_url or os.getenv("LLMONITOR_API_URL") or DEFAULT_API_URL + self.__verbose = verbose or bool(os.getenv("LLMONITOR_VERBOSE")) + _app_id = app_id or os.getenv("LLMONITOR_APP_ID") if _app_id is None: raise ValueError( @@ -89,7 +178,12 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): def __send_event(self, event: Dict[str, Any]) -> None: headers = {"Content-Type": "application/json"} + event = {**event, "app": self.__app_id, "timestamp": str(datetime.utcnow())} + + if self.__verbose: + print("llmonitor_callback", event) + data = {"events": event} 
requests.post(headers=headers, url=f"{self.__api_url}/api/report", json=data) @@ -110,7 +204,7 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): "userId": (metadata or {}).get("userId"), "runId": str(run_id), "parentRunId": str(parent_run_id) if parent_run_id else None, - "input": prompts[0], + "input": _parse_input(prompts), "name": kwargs.get("invocation_params", {}).get("model_name"), "tags": tags, "metadata": metadata, @@ -128,13 +222,15 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): metadata: Union[Dict[str, Any], None] = None, **kwargs: Any, ) -> Any: + user_id = _get_user_id(metadata) + event = { "event": "start", "type": "llm", - "userId": (metadata or {}).get("userId"), + "userId": user_id, "runId": str(run_id), "parentRunId": str(parent_run_id) if parent_run_id else None, - "input": [_serialize_lc_message(message[0]) for message in messages], + "input": _parse_lc_messages(messages[0]), "name": kwargs.get("invocation_params", {}).get("model_name"), "tags": tags, "metadata": metadata, @@ -151,36 +247,26 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): ) -> None: token_usage = (response.llm_output or {}).get("token_usage", {}) + parsed_output = _parse_lc_messages( + map( + lambda o: o.message if hasattr(o, "message") else None, + response.generations[0], + ) + ) + event = { "event": "end", "type": "llm", "runId": str(run_id), "parent_run_id": str(parent_run_id) if parent_run_id else None, - "output": {"text": response.generations[0][0].text, "role": "ai"}, + "output": parsed_output, "tokensUsage": { - "prompt": token_usage.get("prompt_tokens", 0), - "completion": token_usage.get("completion_tokens", 0), + "prompt": token_usage.get("prompt_tokens"), + "completion": token_usage.get("completion_tokens"), }, } self.__send_event(event) - def on_llm_error( - self, - error: Union[Exception, KeyboardInterrupt], - *, - run_id: UUID, - parent_run_id: Union[UUID, None] = None, - **kwargs: Any, - ) -> Any: - event = { - "event": "error", - "type": "llm", - "runId": str(run_id), - "parent_run_id": str(parent_run_id) if parent_run_id else None, - "error": {"message": str(error), "stack": traceback.format_exc()}, - } - self.__send_event(event) - def on_tool_start( self, serialized: Dict[str, Any], @@ -192,10 +278,11 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): metadata: Union[Dict[str, Any], None] = None, **kwargs: Any, ) -> None: + user_id = _get_user_id(metadata) event = { "event": "start", "type": "tool", - "userId": (metadata or {}).get("userId"), + "userId": user_id, "runId": str(run_id), "parentRunId": str(parent_run_id) if parent_run_id else None, "name": serialized.get("name"), @@ -236,25 +323,34 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): ) -> Any: name = serialized.get("id", [None, None, None, None])[3] type = "chain" + metadata = metadata or {} + + agentName = metadata.get("agent_name") + if agentName is None: + agentName = metadata.get("agentName") - agentName = (metadata or {}).get("agentName") if agentName is not None: type = "agent" name = agentName if name == "AgentExecutor" or name == "PlanAndExecute": type = "agent" + + if parent_run_id is not None: + type = "chain" + + user_id = _get_user_id(metadata) + event = { "event": "start", "type": type, - "userId": (metadata or {}).get("userId"), + "userId": user_id, "runId": str(run_id), "parentRunId": str(parent_run_id) if parent_run_id else None, - "input": inputs.get("input", inputs), + "input": _parse_input(inputs), "tags": tags, "metadata": metadata, - "name": serialized.get("id", [None, 
None, None, None])[3], + "name": name, } - self.__send_event(event) def on_chain_end( @@ -269,7 +365,42 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): "event": "end", "type": "chain", "runId": str(run_id), - "output": outputs.get("output", outputs), + "output": _parse_output(outputs), + } + self.__send_event(event) + + def on_agent_action( + self, + action: AgentAction, + *, + run_id: UUID, + parent_run_id: Union[UUID, None] = None, + **kwargs: Any, + ) -> Any: + event = { + "event": "start", + "type": "tool", + "runId": str(run_id), + "parentRunId": str(parent_run_id) if parent_run_id else None, + "name": action.tool, + "input": _parse_input(action.tool_input), + } + self.__send_event(event) + + def on_agent_finish( + self, + finish: AgentFinish, + *, + run_id: UUID, + parent_run_id: Union[UUID, None] = None, + **kwargs: Any, + ) -> Any: + event = { + "event": "end", + "type": "agent", + "runId": str(run_id), + "parentRunId": str(parent_run_id) if parent_run_id else None, + "output": _parse_output(finish.return_values), } self.__send_event(event) @@ -290,38 +421,37 @@ class LLMonitorCallbackHandler(BaseCallbackHandler): } self.__send_event(event) - def on_agent_action( + def on_tool_error( self, - action: AgentAction, + error: Union[Exception, KeyboardInterrupt], *, run_id: UUID, parent_run_id: Union[UUID, None] = None, **kwargs: Any, ) -> Any: event = { - "event": "start", + "event": "error", "type": "tool", "runId": str(run_id), - "parentRunId": str(parent_run_id) if parent_run_id else None, - "name": action.tool, - "input": action.tool_input, + "parent_run_id": str(parent_run_id) if parent_run_id else None, + "error": {"message": str(error), "stack": traceback.format_exc()}, } self.__send_event(event) - def on_agent_finish( + def on_llm_error( self, - finish: AgentFinish, + error: Union[Exception, KeyboardInterrupt], *, run_id: UUID, parent_run_id: Union[UUID, None] = None, **kwargs: Any, ) -> Any: event = { - "event": "end", - "type": "agent", + "event": "error", + "type": "llm", "runId": str(run_id), - "parentRunId": str(parent_run_id) if parent_run_id else None, - "output": finish.return_values, + "parent_run_id": str(parent_run_id) if parent_run_id else None, + "error": {"message": str(error), "stack": traceback.format_exc()}, } self.__send_event(event) diff --git a/libs/langchain/langchain/callbacks/tracers/evaluation.py b/libs/langchain/langchain/callbacks/tracers/evaluation.py index 5b178c84e6..1cf205e3d2 100644 --- a/libs/langchain/langchain/callbacks/tracers/evaluation.py +++ b/libs/langchain/langchain/callbacks/tracers/evaluation.py @@ -2,29 +2,20 @@ from __future__ import annotations import logging -from concurrent.futures import Future, ThreadPoolExecutor, wait +from concurrent.futures import Future, ThreadPoolExecutor from typing import Any, Dict, List, Optional, Sequence, Set, Union from uuid import UUID import langsmith from langsmith import schemas as langsmith_schemas -from langchain.callbacks.manager import tracing_v2_enabled +from langchain.callbacks import manager +from langchain.callbacks.tracers import langchain as langchain_tracer from langchain.callbacks.tracers.base import BaseTracer -from langchain.callbacks.tracers.langchain import _get_client from langchain.callbacks.tracers.schemas import Run logger = logging.getLogger(__name__) -_TRACERS: List[EvaluatorCallbackHandler] = [] - - -def wait_for_all_evaluators() -> None: - """Wait for all tracers to finish.""" - global _TRACERS - for tracer in _TRACERS: - tracer.wait_for_futures() - class 
EvaluatorCallbackHandler(BaseTracer): """A tracer that runs a run evaluator whenever a run is persisted. @@ -79,17 +70,13 @@ class EvaluatorCallbackHandler(BaseTracer): self.example_id = ( UUID(example_id) if isinstance(example_id, str) else example_id ) - self.client = client or _get_client() + self.client = client or langchain_tracer.get_client() self.evaluators = evaluators - self.executor = ThreadPoolExecutor( - max_workers=max(max_workers or len(evaluators), 1) - ) + self.max_workers = max_workers or len(evaluators) self.futures: Set[Future] = set() self.skip_unfinished = skip_unfinished self.project_name = project_name self.logged_feedback: Dict[str, List[langsmith_schemas.Feedback]] = {} - global _TRACERS - _TRACERS.append(self) def _evaluate_in_project(self, run: Run, evaluator: langsmith.RunEvaluator) -> None: """Evaluate the run in the project. @@ -105,7 +92,7 @@ class EvaluatorCallbackHandler(BaseTracer): try: if self.project_name is None: feedback = self.client.evaluate_run(run, evaluator) - with tracing_v2_enabled( + with manager.tracing_v2_enabled( project_name=self.project_name, tags=["eval"], client=self.client ): feedback = self.client.evaluate_run(run, evaluator) @@ -133,14 +120,15 @@ class EvaluatorCallbackHandler(BaseTracer): return run_ = run.copy() run_.reference_example_id = self.example_id - for evaluator in self.evaluators: - self.futures.add( - self.executor.submit(self._evaluate_in_project, run_, evaluator) - ) - - def wait_for_futures(self) -> None: - """Wait for all futures to complete.""" - futures = list(self.futures) - wait(futures) - for future in futures: - self.futures.remove(future) + if self.max_workers > 0: + with ThreadPoolExecutor(max_workers=self.max_workers) as executor: + list( + executor.map( + self._evaluate_in_project, + [run_ for _ in range(len(self.evaluators))], + self.evaluators, + ) + ) + else: + for evaluator in self.evaluators: + self._evaluate_in_project(run_, evaluator) diff --git a/libs/langchain/langchain/callbacks/tracers/langchain.py b/libs/langchain/langchain/callbacks/tracers/langchain.py index 0f57697721..0e6393c78b 100644 --- a/libs/langchain/langchain/callbacks/tracers/langchain.py +++ b/libs/langchain/langchain/callbacks/tracers/langchain.py @@ -42,7 +42,7 @@ def wait_for_all_tracers() -> None: tracer.wait_for_futures() -def _get_client() -> Client: +def get_client() -> Client: """Get the client.""" global _CLIENT if _CLIENT is None: @@ -83,7 +83,7 @@ class LangChainTracer(BaseTracer): _EXECUTORS.append(self.executor) else: self.executor = None - self.client = client or _get_client() + self.client = client or get_client() self._futures: Set[Future] = set() self.tags = tags or [] global _TRACERS diff --git a/libs/langchain/langchain/chains/graph_qa/sparql.py b/libs/langchain/langchain/chains/graph_qa/sparql.py index eb8a365d76..2e1c017748 100644 --- a/libs/langchain/langchain/chains/graph_qa/sparql.py +++ b/libs/langchain/langchain/chains/graph_qa/sparql.py @@ -84,17 +84,17 @@ class GraphSparqlQAChain(Chain): _intent = self.sparql_intent_chain.run({"prompt": prompt}, callbacks=callbacks) intent = _intent.strip() - if "SELECT" not in intent and "UPDATE" not in intent: - raise ValueError( - "I am sorry, but this prompt seems to fit none of the currently " - "supported SPARQL query types, i.e., SELECT and UPDATE." 
- ) - elif intent.find("SELECT") < intent.find("UPDATE"): + if "SELECT" in intent and "UPDATE" not in intent: sparql_generation_chain = self.sparql_generation_select_chain intent = "SELECT" - else: + elif "UPDATE" in intent and "SELECT" not in intent: sparql_generation_chain = self.sparql_generation_update_chain intent = "UPDATE" + else: + raise ValueError( + "I am sorry, but this prompt seems to fit none of the currently " + "supported SPARQL query types, i.e., SELECT and UPDATE." + ) _run_manager.on_text("Identified intent:", end="\n", verbose=self.verbose) _run_manager.on_text(intent, color="green", end="\n", verbose=self.verbose) diff --git a/libs/langchain/langchain/chains/loading.py b/libs/langchain/langchain/chains/loading.py index c2e0b81397..9543f62988 100644 --- a/libs/langchain/langchain/chains/loading.py +++ b/libs/langchain/langchain/chains/loading.py @@ -20,6 +20,7 @@ from langchain.chains.llm_checker.base import LLMCheckerChain from langchain.chains.llm_math.base import LLMMathChain from langchain.chains.llm_requests import LLMRequestsChain from langchain.chains.qa_with_sources.base import QAWithSourcesChain +from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain from langchain.chains.qa_with_sources.vector_db import VectorDBQAWithSourcesChain from langchain.chains.retrieval_qa.base import RetrievalQA, VectorDBQA from langchain.llms.loading import load_llm, load_llm_from_config @@ -424,6 +425,30 @@ def _load_retrieval_qa(config: dict, **kwargs: Any) -> RetrievalQA: ) +def _load_retrieval_qa_with_sources_chain( + config: dict, **kwargs: Any +) -> RetrievalQAWithSourcesChain: + if "retriever" in kwargs: + retriever = kwargs.pop("retriever") + else: + raise ValueError("`retriever` must be present.") + if "combine_documents_chain" in config: + combine_documents_chain_config = config.pop("combine_documents_chain") + combine_documents_chain = load_chain_from_config(combine_documents_chain_config) + elif "combine_documents_chain_path" in config: + combine_documents_chain = load_chain(config.pop("combine_documents_chain_path")) + else: + raise ValueError( + "One of `combine_documents_chain` or " + "`combine_documents_chain_path` must be present." 
+ ) + return RetrievalQAWithSourcesChain( + combine_documents_chain=combine_documents_chain, + retriever=retriever, + **config, + ) + + def _load_vector_db_qa(config: dict, **kwargs: Any) -> VectorDBQA: if "vectorstore" in kwargs: vectorstore = kwargs.pop("vectorstore") @@ -537,6 +562,7 @@ type_to_loader_dict = { "vector_db_qa_with_sources_chain": _load_vector_db_qa_with_sources_chain, "vector_db_qa": _load_vector_db_qa, "retrieval_qa": _load_retrieval_qa, + "retrieval_qa_with_sources_chain": _load_retrieval_qa_with_sources_chain, "graph_cypher_chain": _load_graph_cypher_chain, } diff --git a/libs/langchain/langchain/chains/qa_with_sources/retrieval.py b/libs/langchain/langchain/chains/qa_with_sources/retrieval.py index c5d587b464..80018950d9 100644 --- a/libs/langchain/langchain/chains/qa_with_sources/retrieval.py +++ b/libs/langchain/langchain/chains/qa_with_sources/retrieval.py @@ -60,3 +60,8 @@ class RetrievalQAWithSourcesChain(BaseQAWithSourcesChain): question, callbacks=run_manager.get_child() ) return self._reduce_tokens_below_limit(docs) + + @property + def _chain_type(self) -> str: + """Return the chain type.""" + return "retrieval_qa_with_sources_chain" diff --git a/libs/langchain/langchain/chat_loaders/__init__.py b/libs/langchain/langchain/chat_loaders/__init__.py index 594d87344d..7547ddcecc 100644 --- a/libs/langchain/langchain/chat_loaders/__init__.py +++ b/libs/langchain/langchain/chat_loaders/__init__.py @@ -1,6 +1,19 @@ -"""Load chat messages from common communications platforms for finetuning. +"""**Chat Loaders** load chat messages from common communications platforms. -This module provides functions to load chat messages from various +Load chat messages from various communications platforms such as Facebook Messenger, Telegram, and -WhatsApp. The loaded chat messages can be used for finetuning models. -""" +WhatsApp. The loaded chat messages can be used for fine-tuning models. + +**Class hierarchy:** + +.. code-block:: + + BaseChatLoader --> <name>ChatLoader # Examples: WhatsAppChatLoader, IMessageChatLoader + +**Main helpers:** + +.. code-block:: + + ChatSession + +""" # noqa: E501 diff --git a/libs/langchain/langchain/chat_loaders/base.py b/libs/langchain/langchain/chat_loaders/base.py index 418ba15d2f..6e1f37ca9a 100644 --- a/libs/langchain/langchain/chat_loaders/base.py +++ b/libs/langchain/langchain/chat_loaders/base.py @@ -1,10 +1,3 @@ -"""Base definitions for chat loaders. - -A chat loader is a class that loads chat messages from an external -source such as a file or a database. The chat messages can then be -used for finetuning. -""" - from abc import ABC, abstractmethod from typing import Iterator, List, Sequence, TypedDict @@ -12,7 +5,7 @@ from langchain.schema.messages import BaseMessage class ChatSession(TypedDict): - """A chat session represents a single + """Chat Session represents a single conversation, channel, or other group of messages.""" messages: Sequence[BaseMessage] diff --git a/libs/langchain/langchain/chat_loaders/facebook_messenger.py b/libs/langchain/langchain/chat_loaders/facebook_messenger.py index 5864c32740..bfdc0155c7 100644 --- a/libs/langchain/langchain/chat_loaders/facebook_messenger.py +++ b/libs/langchain/langchain/chat_loaders/facebook_messenger.py @@ -10,7 +10,7 @@ logger = logging.getLogger(__file__) class SingleFileFacebookMessengerChatLoader(BaseChatLoader): - """A chat loader for loading Facebook Messenger chat data from a single file. + """Load `Facebook Messenger` chat data from a single file. 
Args: path (Union[Path, str]): The path to the chat file. @@ -45,7 +45,7 @@ class SingleFileFacebookMessengerChatLoader(BaseChatLoader): class FolderFacebookMessengerChatLoader(BaseChatLoader): - """A chat loader for loading Facebook Messenger chat data from a folder. + """Load `Facebook Messenger` chat data from a folder. Args: path (Union[str, Path]): The path to the directory diff --git a/libs/langchain/langchain/chat_loaders/gmail.py b/libs/langchain/langchain/chat_loaders/gmail.py index 4e88accdee..94a3c5617e 100644 --- a/libs/langchain/langchain/chat_loaders/gmail.py +++ b/libs/langchain/langchain/chat_loaders/gmail.py @@ -62,7 +62,7 @@ def _get_message_data(service: Any, message: Any) -> ChatSession: class GMailLoader(BaseChatLoader): - """This loader goes over how to load data from GMail. + """Load data from `GMail`. There are many ways you could want to load data from GMail. This loader is currently fairly opinionated in how to do so. diff --git a/libs/langchain/langchain/chat_loaders/imessage.py b/libs/langchain/langchain/chat_loaders/imessage.py index ff9a06142c..eed0cfea37 100644 --- a/libs/langchain/langchain/chat_loaders/imessage.py +++ b/libs/langchain/langchain/chat_loaders/imessage.py @@ -1,27 +1,27 @@ -"""IMessage Chat Loader. - -This class is used to load chat sessions from the iMessage chat.db SQLite file. -It only works on macOS when you have iMessage enabled and have the chat.db file. - -The chat.db file is likely located at ~/Library/Messages/chat.db. However, your -terminal may not have permission to access this file. To resolve this, you can -copy the file to a different location, change the permissions of the file, or -grant full disk access for your terminal emulator in System Settings > Security -and Privacy > Full Disk Access. -""" from __future__ import annotations from pathlib import Path from typing import TYPE_CHECKING, Iterator, List, Optional, Union from langchain import schema -from langchain.chat_loaders import base as chat_loaders +from langchain.chat_loaders.base import BaseChatLoader, ChatSession if TYPE_CHECKING: import sqlite3 -class IMessageChatLoader(chat_loaders.BaseChatLoader): +class IMessageChatLoader(BaseChatLoader): + """Load chat sessions from the `iMessage` chat.db SQLite file. + + It only works on macOS when you have iMessage enabled and have the chat.db file. + + The chat.db file is likely located at ~/Library/Messages/chat.db. However, your + terminal may not have permission to access this file. To resolve this, you can + copy the file to a different location, change the permissions of the file, or + grant full disk access for your terminal emulator + in System Settings > Security and Privacy > Full Disk Access. + """ + def __init__(self, path: Optional[Union[str, Path]] = None): """ Initialize the IMessageChatLoader. @@ -46,7 +46,7 @@ class IMessageChatLoader(chat_loaders.BaseChatLoader): def _load_single_chat_session( self, cursor: "sqlite3.Cursor", chat_id: int - ) -> chat_loaders.ChatSession: + ) -> ChatSession: """ Load a single chat session from the iMessage chat.db. @@ -83,9 +83,9 @@ class IMessageChatLoader(chat_loaders.BaseChatLoader): ) ) - return chat_loaders.ChatSession(messages=results) + return ChatSession(messages=results) - def lazy_load(self) -> Iterator[chat_loaders.ChatSession]: + def lazy_load(self) -> Iterator[ChatSession]: """ Lazy load the chat sessions from the iMessage chat.db and yield them in the required format. 
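For reference, a minimal usage sketch of the chat loaders touched above, using the iMessage loader; the database path here is illustrative, and it assumes chat.db has already been copied somewhere the process can read, as the docstring advises:

.. code-block:: python

    from langchain.chat_loaders.imessage import IMessageChatLoader

    # Point the loader at a readable copy of the iMessage database.
    loader = IMessageChatLoader(path="/tmp/chat.db")

    # lazy_load() yields ChatSession dicts, each holding a "messages" sequence.
    for session in loader.lazy_load():
        for message in session["messages"]:
            print(message.content)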
diff --git a/libs/langchain/langchain/chat_loaders/slack.py b/libs/langchain/langchain/chat_loaders/slack.py index 261289bb43..7c9f76c965 100644 --- a/libs/langchain/langchain/chat_loaders/slack.py +++ b/libs/langchain/langchain/chat_loaders/slack.py @@ -6,12 +6,14 @@ from pathlib import Path from typing import Dict, Iterator, List, Union from langchain import schema -from langchain.chat_loaders import base as chat_loaders +from langchain.chat_loaders.base import BaseChatLoader, ChatSession logger = logging.getLogger(__name__) -class SlackChatLoader(chat_loaders.BaseChatLoader): +class SlackChatLoader(BaseChatLoader): + """Load `Slack` conversations from a dump zip file.""" + def __init__( self, path: Union[str, Path], @@ -25,9 +27,7 @@ class SlackChatLoader(chat_loaders.BaseChatLoader): if not self.zip_path.exists(): raise FileNotFoundError(f"File {self.zip_path} not found") - def _load_single_chat_session( - self, messages: List[Dict] - ) -> chat_loaders.ChatSession: + def _load_single_chat_session(self, messages: List[Dict]) -> ChatSession: results: List[Union[schema.AIMessage, schema.HumanMessage]] = [] previous_sender = None for message in messages: @@ -60,7 +60,7 @@ class SlackChatLoader(chat_loaders.BaseChatLoader): ) ) previous_sender = sender - return chat_loaders.ChatSession(messages=results) + return ChatSession(messages=results) def _read_json(self, zip_file: zipfile.ZipFile, file_path: str) -> List[dict]: """Read JSON data from a zip subfile.""" @@ -70,7 +70,7 @@ class SlackChatLoader(chat_loaders.BaseChatLoader): raise ValueError(f"Expected list of dictionaries, got {type(data)}") return data - def lazy_load(self) -> Iterator[chat_loaders.ChatSession]: + def lazy_load(self) -> Iterator[ChatSession]: """ Lazy load the chat sessions from the Slack dump file and yield them in the required format. diff --git a/libs/langchain/langchain/chat_loaders/telegram.py b/libs/langchain/langchain/chat_loaders/telegram.py index 786dad7278..12c30014ac 100644 --- a/libs/langchain/langchain/chat_loaders/telegram.py +++ b/libs/langchain/langchain/chat_loaders/telegram.py @@ -1,19 +1,19 @@ import json import logging import os +import tempfile import zipfile from pathlib import Path from typing import Iterator, List, Union from langchain import schema -from langchain.chat_loaders import base as chat_loaders +from langchain.chat_loaders.base import BaseChatLoader, ChatSession logger = logging.getLogger(__name__) -class TelegramChatLoader(chat_loaders.BaseChatLoader): - """A loading utility for converting telegram conversations - to LangChain chat messages. +class TelegramChatLoader(BaseChatLoader): + """Load `telegram` conversations to LangChain chat messages. To export, use the Telegram Desktop app from https://desktop.telegram.org/, select a conversation, click the three dots @@ -35,16 +35,14 @@ class TelegramChatLoader(chat_loaders.BaseChatLoader): """ self.path = path if isinstance(path, str) else str(path) - def _load_single_chat_session_html( - self, file_path: str - ) -> chat_loaders.ChatSession: + def _load_single_chat_session_html(self, file_path: str) -> ChatSession: """Load a single chat session from an HTML file. Args: file_path (str): Path to the HTML file. Returns: - chat_loaders.ChatSession: The loaded chat session. + ChatSession: The loaded chat session. 
""" try: from bs4 import BeautifulSoup @@ -81,18 +79,16 @@ class TelegramChatLoader(chat_loaders.BaseChatLoader): ) previous_sender = from_name - return chat_loaders.ChatSession(messages=results) + return ChatSession(messages=results) - def _load_single_chat_session_json( - self, file_path: str - ) -> chat_loaders.ChatSession: + def _load_single_chat_session_json(self, file_path: str) -> ChatSession: """Load a single chat session from a JSON file. Args: file_path (str): Path to the JSON file. Returns: - chat_loaders.ChatSession: The loaded chat session. + ChatSession: The loaded chat session. """ with open(file_path, "r", encoding="utf-8") as file: data = json.load(file) @@ -114,7 +110,7 @@ class TelegramChatLoader(chat_loaders.BaseChatLoader): ) ) - return chat_loaders.ChatSession(messages=results) + return ChatSession(messages=results) def _iterate_files(self, path: str) -> Iterator[str]: """Iterate over files in a directory or zip file. @@ -136,14 +132,15 @@ class TelegramChatLoader(chat_loaders.BaseChatLoader): with zipfile.ZipFile(path) as zip_file: for file in zip_file.namelist(): if file.endswith((".html", ".json")): - yield zip_file.extract(file) + with tempfile.TemporaryDirectory() as temp_dir: + yield zip_file.extract(file, path=temp_dir) - def lazy_load(self) -> Iterator[chat_loaders.ChatSession]: + def lazy_load(self) -> Iterator[ChatSession]: """Lazy load the messages from the chat file and yield them in as chat sessions. Yields: - chat_loaders.ChatSession: The loaded chat session. + ChatSession: The loaded chat session. """ for file_path in self._iterate_files(self.path): if file_path.endswith(".html"): diff --git a/libs/langchain/langchain/chat_loaders/whatsapp.py b/libs/langchain/langchain/chat_loaders/whatsapp.py index c911e262c6..39266485e2 100644 --- a/libs/langchain/langchain/chat_loaders/whatsapp.py +++ b/libs/langchain/langchain/chat_loaders/whatsapp.py @@ -5,13 +5,15 @@ import zipfile from typing import Iterator, List, Union from langchain import schema -from langchain.chat_loaders import base as chat_loaders +from langchain.chat_loaders.base import BaseChatLoader, ChatSession from langchain.schema import messages logger = logging.getLogger(__name__) -class WhatsAppChatLoader(chat_loaders.BaseChatLoader): +class WhatsAppChatLoader(BaseChatLoader): + """Load `WhatsApp` conversations from a dump zip file or directory.""" + def __init__(self, path: str): """Initialize the WhatsAppChatLoader. @@ -40,7 +42,7 @@ class WhatsAppChatLoader(chat_loaders.BaseChatLoader): flags=re.IGNORECASE, ) - def _load_single_chat_session(self, file_path: str) -> chat_loaders.ChatSession: + def _load_single_chat_session(self, file_path: str) -> ChatSession: """Load a single chat session from a file. Args: @@ -82,7 +84,7 @@ class WhatsAppChatLoader(chat_loaders.BaseChatLoader): ) else: logger.debug(f"Could not parse line: {line}") - return chat_loaders.ChatSession(messages=results) + return ChatSession(messages=results) def _iterate_files(self, path: str) -> Iterator[str]: """Iterate over the files in a directory or zip file. @@ -106,7 +108,7 @@ class WhatsAppChatLoader(chat_loaders.BaseChatLoader): if file.endswith(".txt"): yield zip_file.extract(file) - def lazy_load(self) -> Iterator[chat_loaders.ChatSession]: + def lazy_load(self) -> Iterator[ChatSession]: """Lazy load the messages from the chat file and yield them as chat sessions. 
diff --git a/libs/langchain/langchain/chat_models/__init__.py b/libs/langchain/langchain/chat_models/__init__.py index b03cb77710..07fe41d723 100644 --- a/libs/langchain/langchain/chat_models/__init__.py +++ b/libs/langchain/langchain/chat_models/__init__.py @@ -20,12 +20,12 @@ an interface where "chat messages" are the inputs and outputs. from langchain.chat_models.anthropic import ChatAnthropic from langchain.chat_models.anyscale import ChatAnyscale from langchain.chat_models.azure_openai import AzureChatOpenAI -from langchain.chat_models.bedrock import BedrockChat from langchain.chat_models.ernie import ErnieBotChat from langchain.chat_models.fake import FakeListChatModel from langchain.chat_models.google_palm import ChatGooglePalm from langchain.chat_models.human import HumanInputChatModel from langchain.chat_models.jinachat import JinaChat +from langchain.chat_models.konko import ChatKonko from langchain.chat_models.litellm import ChatLiteLLM from langchain.chat_models.mlflow_ai_gateway import ChatMLflowAIGateway from langchain.chat_models.ollama import ChatOllama @@ -36,7 +36,6 @@ from langchain.chat_models.vertexai import ChatVertexAI __all__ = [ "ChatOpenAI", "AzureChatOpenAI", - "BedrockChat", "FakeListChatModel", "PromptLayerChatOpenAI", "ChatAnthropic", @@ -49,4 +48,5 @@ __all__ = [ "ChatAnyscale", "ChatLiteLLM", "ErnieBotChat", + "ChatKonko", ] diff --git a/libs/langchain/langchain/chat_models/konko.py b/libs/langchain/langchain/chat_models/konko.py new file mode 100644 index 0000000000..b7b9bc6581 --- /dev/null +++ b/libs/langchain/langchain/chat_models/konko.py @@ -0,0 +1,292 @@ +"""KonkoAI chat wrapper.""" +from __future__ import annotations + +import logging +import os +from typing import ( + Any, + Dict, + Iterator, + List, + Mapping, + Optional, + Set, + Tuple, + Union, +) + +import requests + +from langchain.adapters.openai import convert_dict_to_message, convert_message_to_dict +from langchain.callbacks.manager import ( + CallbackManagerForLLMRun, +) +from langchain.chat_models.openai import ChatOpenAI, _convert_delta_to_message_chunk +from langchain.pydantic_v1 import Field, root_validator +from langchain.schema import ChatGeneration, ChatResult +from langchain.schema.messages import AIMessageChunk, BaseMessage +from langchain.schema.output import ChatGenerationChunk +from langchain.utils import get_from_dict_or_env + +DEFAULT_API_BASE = "https://api.konko.ai/v1" +DEFAULT_MODEL = "meta-llama/Llama-2-13b-chat-hf" + +logger = logging.getLogger(__name__) + + +class ChatKonko(ChatOpenAI): + """`ChatKonko` Chat large language models API. + + To use, you should have the ``konko`` python package installed, and the + environment variables ``KONKO_API_KEY`` and ``OPENAI_API_KEY`` set with your API keys. + + Any parameters that are valid to be passed to the konko.create call can be passed + in, even if not explicitly saved on this class. + + Example: + ..
code-block:: python + + from langchain.chat_models import ChatKonko + llm = ChatKonko(model="meta-llama/Llama-2-13b-chat-hf") + """ + + @property + def lc_secrets(self) -> Dict[str, str]: + return {"konko_api_key": "KONKO_API_KEY", "openai_api_key": "OPENAI_API_KEY"} + + @property + def lc_serializable(self) -> bool: + return True + + client: Any = None #: :meta private: + model: str = Field(default=DEFAULT_MODEL, alias="model") + """Model name to use.""" + temperature: float = 0.7 + """What sampling temperature to use.""" + model_kwargs: Dict[str, Any] = Field(default_factory=dict) + """Holds any model parameters valid for `create` call not explicitly specified.""" + openai_api_key: Optional[str] = None + konko_api_key: Optional[str] = None + request_timeout: Optional[Union[float, Tuple[float, float]]] = None + """Timeout for requests to Konko completion API.""" + max_retries: int = 6 + """Maximum number of retries to make when generating.""" + streaming: bool = False + """Whether to stream the results or not.""" + n: int = 1 + """Number of chat completions to generate for each prompt.""" + max_tokens: int = 20 + """Maximum number of tokens to generate.""" + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + """Validate that api key and python package exists in environment.""" + values["konko_api_key"] = get_from_dict_or_env( + values, "konko_api_key", "KONKO_API_KEY" + ) + try: + import konko + + except ImportError: + raise ValueError( + "Could not import konko python package. " + "Please install it with `pip install konko`." + ) + try: + values["client"] = konko.ChatCompletion + except AttributeError: + raise ValueError( + "`konko` has no `ChatCompletion` attribute, this is likely " + "due to an old version of the konko package. Try upgrading it " + "with `pip install --upgrade konko`." + ) + if values["n"] < 1: + raise ValueError("n must be at least 1.") + if values["n"] > 1 and values["streaming"]: + raise ValueError("n must be 1 when streaming.") + return values + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters for calling Konko API.""" + return { + "model": self.model, + "request_timeout": self.request_timeout, + "max_tokens": self.max_tokens, + "stream": self.streaming, + "n": self.n, + "temperature": self.temperature, + **self.model_kwargs, + } + + @staticmethod + def get_available_models( + konko_api_key: Optional[str] = None, + openai_api_key: Optional[str] = None, + konko_api_base: str = DEFAULT_API_BASE, + ) -> Set[str]: + """Get available models from Konko API.""" + + # Try to retrieve the OpenAI API key if it's not passed as an argument + if not openai_api_key: + try: + openai_api_key = os.environ["OPENAI_API_KEY"] + except KeyError: + pass # It's okay if it's not set, we just won't use it + + # Try to retrieve the Konko API key if it's not passed as an argument + if not konko_api_key: + try: + konko_api_key = os.environ["KONKO_API_KEY"] + except KeyError: + raise ValueError( + "Konko API key must be passed as keyword argument or " + "set in environment variable KONKO_API_KEY." 
+ ) + + models_url = f"{konko_api_base}/models" + + headers = { + "Authorization": f"Bearer {konko_api_key}", + } + + if openai_api_key: + headers["X-OpenAI-Api-Key"] = openai_api_key + + models_response = requests.get(models_url, headers=headers) + + if models_response.status_code != 200: + raise ValueError( + f"Error getting models from {models_url}: " + f"{models_response.status_code}" + ) + + return {model["id"] for model in models_response.json()["data"]} + + def completion_with_retry( + self, run_manager: Optional[CallbackManagerForLLMRun] = None, **kwargs: Any + ) -> Any: + def _completion_with_retry(**kwargs: Any) -> Any: + return self.client.create(**kwargs) + + return _completion_with_retry(**kwargs) + + def _combine_llm_outputs(self, llm_outputs: List[Optional[dict]]) -> dict: + overall_token_usage: dict = {} + for output in llm_outputs: + if output is None: + # Happens in streaming + continue + token_usage = output["token_usage"] + for k, v in token_usage.items(): + if k in overall_token_usage: + overall_token_usage[k] += v + else: + overall_token_usage[k] = v + return {"token_usage": overall_token_usage, "model_name": self.model} + + def _stream( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> Iterator[ChatGenerationChunk]: + message_dicts, params = self._create_message_dicts(messages, stop) + params = {**params, **kwargs, "stream": True} + + default_chunk_class = AIMessageChunk + for chunk in self.completion_with_retry( + messages=message_dicts, run_manager=run_manager, **params + ): + if len(chunk["choices"]) == 0: + continue + choice = chunk["choices"][0] + chunk = _convert_delta_to_message_chunk( + choice["delta"], default_chunk_class + ) + finish_reason = choice.get("finish_reason") + generation_info = ( + dict(finish_reason=finish_reason) if finish_reason is not None else None + ) + default_chunk_class = chunk.__class__ + yield ChatGenerationChunk(message=chunk, generation_info=generation_info) + if run_manager: + run_manager.on_llm_new_token(chunk.content, chunk=chunk) + + def _generate( + self, + messages: List[BaseMessage], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + stream: Optional[bool] = None, + **kwargs: Any, + ) -> ChatResult: + if stream if stream is not None else self.streaming: + generation: Optional[ChatGenerationChunk] = None + for chunk in self._stream( + messages=messages, stop=stop, run_manager=run_manager, **kwargs + ): + if generation is None: + generation = chunk + else: + generation += chunk + assert generation is not None + return ChatResult(generations=[generation]) + + message_dicts, params = self._create_message_dicts(messages, stop) + params = {**params, **kwargs} + response = self.completion_with_retry( + messages=message_dicts, run_manager=run_manager, **params + ) + return self._create_chat_result(response) + + def _create_message_dicts( + self, messages: List[BaseMessage], stop: Optional[List[str]] + ) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]: + params = self._client_params + if stop is not None: + if "stop" in params: + raise ValueError("`stop` found in both the input and default params.") + params["stop"] = stop + message_dicts = [convert_message_to_dict(m) for m in messages] + return message_dicts, params + + def _create_chat_result(self, response: Mapping[str, Any]) -> ChatResult: + generations = [] + for res in response["choices"]: + message = 
convert_dict_to_message(res["message"]) + gen = ChatGeneration( + message=message, + generation_info=dict(finish_reason=res.get("finish_reason")), + ) + generations.append(gen) + token_usage = response.get("usage", {}) + llm_output = {"token_usage": token_usage, "model_name": self.model} + return ChatResult(generations=generations, llm_output=llm_output) + + @property + def _identifying_params(self) -> Dict[str, Any]: + """Get the identifying parameters.""" + return {**{"model_name": self.model}, **self._default_params} + + @property + def _client_params(self) -> Dict[str, Any]: + """Get the parameters used for the konko client.""" + return {**self._default_params} + + def _get_invocation_params( + self, stop: Optional[List[str]] = None, **kwargs: Any + ) -> Dict[str, Any]: + """Get the parameters used to invoke the model.""" + return { + "model": self.model, + **super()._get_invocation_params(stop=stop), + **self._default_params, + **kwargs, + } + + @property + def _llm_type(self) -> str: + """Return type of chat model.""" + return "konko-chat" diff --git a/libs/langchain/langchain/document_loaders/url_playwright.py b/libs/langchain/langchain/document_loaders/url_playwright.py index 16f5b00fd3..7aa60cf188 100644 --- a/libs/langchain/langchain/document_loaders/url_playwright.py +++ b/libs/langchain/langchain/document_loaders/url_playwright.py @@ -8,7 +8,9 @@ from langchain.docstore.document import Document from langchain.document_loaders.base import BaseLoader if TYPE_CHECKING: - from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse + from playwright.async_api import Browser as AsyncBrowser + from playwright.async_api import Page as AsyncPage + from playwright.async_api import Response as AsyncResponse from playwright.sync_api import Browser, Page, Response @@ -155,6 +157,9 @@ class PlaywrightURLLoader(BaseLoader): try: page = browser.new_page() response = page.goto(url) + if response is None: + raise ValueError(f"page.goto() returned None for url {url}") + text = self.evaluator.evaluate(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) @@ -185,6 +190,9 @@ class PlaywrightURLLoader(BaseLoader): try: page = await browser.new_page() response = await page.goto(url) + if response is None: + raise ValueError(f"page.goto() returned None for url {url}") + text = await self.evaluator.evaluate_async(page, browser, response) metadata = {"source": url} docs.append(Document(page_content=text, metadata=metadata)) diff --git a/libs/langchain/langchain/embeddings/__init__.py b/libs/langchain/langchain/embeddings/__init__.py index 87cb5e90d5..e8aa683a9a 100644 --- a/libs/langchain/langchain/embeddings/__init__.py +++ b/libs/langchain/langchain/embeddings/__init__.py @@ -35,6 +35,7 @@ from langchain.embeddings.gpt4all import GPT4AllEmbeddings from langchain.embeddings.huggingface import ( HuggingFaceBgeEmbeddings, HuggingFaceEmbeddings, + HuggingFaceInferenceAPIEmbeddings, HuggingFaceInstructEmbeddings, ) from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings @@ -69,6 +70,7 @@ __all__ = [ "CohereEmbeddings", "ElasticsearchEmbeddings", "HuggingFaceEmbeddings", + "HuggingFaceInferenceAPIEmbeddings", "JinaEmbeddings", "LlamaCppEmbeddings", "HuggingFaceHubEmbeddings", diff --git a/libs/langchain/langchain/embeddings/ernie.py b/libs/langchain/langchain/embeddings/ernie.py index b8213651ad..37723b53ab 100644 --- a/libs/langchain/langchain/embeddings/ernie.py +++ b/libs/langchain/langchain/embeddings/ernie.py @@ 
-1,5 +1,7 @@ +import asyncio import logging import threading +from functools import partial from typing import Dict, List, Optional import requests @@ -14,6 +16,7 @@ logger = logging.getLogger(__name__) class ErnieEmbeddings(BaseModel, Embeddings): """`Ernie Embeddings V1` embedding models.""" + ernie_api_base: Optional[str] = None ernie_client_id: Optional[str] = None ernie_client_secret: Optional[str] = None access_token: Optional[str] = None @@ -26,6 +29,9 @@ class ErnieEmbeddings(BaseModel, Embeddings): @root_validator() def validate_environment(cls, values: Dict) -> Dict: + values["ernie_api_base"] = get_from_dict_or_env( + values, "ernie_api_base", "ERNIE_API_BASE", "https://aip.baidubce.com" + ) values["ernie_client_id"] = get_from_dict_or_env( values, "ernie_client_id", @@ -40,7 +46,7 @@ class ErnieEmbeddings(BaseModel, Embeddings): def _embedding(self, json: object) -> dict: base_url = ( - "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings" + f"{self.ernie_api_base}/rpc/2.0/ai_custom/v1/wenxinworkshop/embeddings" ) resp = requests.post( f"{base_url}/embedding-v1", @@ -71,6 +77,15 @@ class ErnieEmbeddings(BaseModel, Embeddings): self.access_token = str(resp.json().get("access_token")) def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Embed search docs. + + Args: + texts: The list of texts to embed. + + Returns: + List[List[float]]: List of embeddings, one for each text. + """ + if not self.access_token: self._refresh_access_token_with_lock() text_in_chunks = [ @@ -90,6 +105,15 @@ class ErnieEmbeddings(BaseModel, Embeddings): return lst def embed_query(self, text: str) -> List[float]: + """Embed query text. + + Args: + text: The text to embed. + + Returns: + List[float]: Embeddings for the text. + """ + if not self.access_token: self._refresh_access_token_with_lock() resp = self._embedding({"input": [text]}) @@ -100,3 +124,31 @@ class ErnieEmbeddings(BaseModel, Embeddings): else: raise ValueError(f"Error from Ernie: {resp}") return resp["data"][0]["embedding"] + + async def aembed_query(self, text: str) -> List[float]: + """Asynchronously embed query text. + + Args: + text: The text to embed. + + Returns: + List[float]: Embeddings for the text. + """ + + return await asyncio.get_running_loop().run_in_executor( + None, partial(self.embed_query, text) + ) + + async def aembed_documents(self, texts: List[str]) -> List[List[float]]: + """Asynchronously embed search docs. + + Args: + texts: The list of texts to embed. + + Returns: + List[List[float]]: List of embeddings, one for each text. + """ + + result = await asyncio.gather(*[self.aembed_query(text) for text in texts]) + + return list(result) diff --git a/libs/langchain/langchain/embeddings/huggingface.py b/libs/langchain/langchain/embeddings/huggingface.py index 52afabd79b..a91d643793 100644 --- a/libs/langchain/langchain/embeddings/huggingface.py +++ b/libs/langchain/langchain/embeddings/huggingface.py @@ -1,5 +1,7 @@ from typing import Any, Dict, List, Optional +import requests + from langchain.embeddings.base import Embeddings from langchain.pydantic_v1 import BaseModel, Extra, Field @@ -58,7 +60,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings): except ImportError as exc: raise ImportError( "Could not import sentence_transformers python package. " - "Please install it with `pip install sentence_transformers`." + "Please install it with `pip install sentence-transformers`."
) from exc self.client = sentence_transformers.SentenceTransformer( @@ -266,3 +268,71 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings): self.query_instruction + text, **self.encode_kwargs ) return embedding.tolist() + + +class HuggingFaceInferenceAPIEmbeddings(BaseModel, Embeddings): + """Embed texts using the HuggingFace API. + + Requires a HuggingFace Inference API key and a model name. + """ + + api_key: str + """Your API key for the HuggingFace Inference API.""" + model_name: str = "sentence-transformers/all-MiniLM-L6-v2" + """The name of the model to use for text embeddings.""" + + @property + def _api_url(self) -> str: + return ( + "https://api-inference.huggingface.co" + "/pipeline" + "/feature-extraction" + f"/{self.model_name}" + ) + + @property + def _headers(self) -> dict: + return {"Authorization": f"Bearer {self.api_key}"} + + def embed_documents(self, texts: List[str]) -> List[List[float]]: + """Get the embeddings for a list of texts. + + Args: + texts (Documents): A list of texts to get embeddings for. + + Returns: + Embedded texts as List[List[float]], where each inner List[float] + corresponds to a single input text. + + Example: + .. code-block:: python + + from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings + + hf_embeddings = HuggingFaceInferenceAPIEmbeddings( + api_key="your_api_key", + model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + texts = ["Hello, world!", "How are you?"] + hf_embeddings.embed_documents(texts) + """ + response = requests.post( + self._api_url, + headers=self._headers, + json={ + "inputs": texts, + "options": {"wait_for_model": True, "use_cache": True}, + }, + ) + return response.json() + + def embed_query(self, text: str) -> List[float]: + """Compute query embeddings using a HuggingFace transformer model. + + Args: + text: The text to embed. + + Returns: + Embeddings for the text. + """ + return self.embed_documents([text])[0] diff --git a/libs/langchain/langchain/graphs/graph_document.py b/libs/langchain/langchain/graphs/graph_document.py new file mode 100644 index 0000000000..9f72a3ad8e --- /dev/null +++ b/libs/langchain/langchain/graphs/graph_document.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +from typing import List, Union + +from langchain.load.serializable import Serializable +from langchain.pydantic_v1 import Field +from langchain.schema import Document + + +class Node(Serializable): + """Represents a node in a graph with associated properties. + + Attributes: + id (Union[str, int]): A unique identifier for the node. + type (str): The type or label of the node, default is "Node". + properties (dict): Additional properties and metadata associated with the node. + """ + + id: Union[str, int] + type: str = "Node" + properties: dict = Field(default_factory=dict) + + +class Relationship(Serializable): + """Represents a directed relationship between two nodes in a graph. + + Attributes: + source (Node): The source node of the relationship. + target (Node): The target node of the relationship. + type (str): The type of the relationship. + properties (dict): Additional properties associated with the relationship. + """ + + source: Node + target: Node + type: str + properties: dict = Field(default_factory=dict) + + +class GraphDocument(Serializable): + """Represents a graph document consisting of nodes and relationships. + + Attributes: + nodes (List[Node]): A list of nodes in the graph. + relationships (List[Relationship]): A list of relationships in the graph.
+ source (Document): The document from which the graph information is derived. + """ + + nodes: List[Node] + relationships: List[Relationship] + source: Document diff --git a/libs/langchain/langchain/graphs/neo4j_graph.py b/libs/langchain/langchain/graphs/neo4j_graph.py index 02572b2d1a..256df9d26b 100644 --- a/libs/langchain/langchain/graphs/neo4j_graph.py +++ b/libs/langchain/langchain/graphs/neo4j_graph.py @@ -1,5 +1,7 @@ from typing import Any, Dict, List +from langchain.graphs.graph_document import GraphDocument + node_properties_query = """ CALL apoc.meta.data() YIELD label, other, elementType, type, property @@ -99,3 +101,56 @@ class Neo4jGraph: The relationships are the following: {[el['output'] for el in relationships]} """ + + def add_graph_documents( + self, graph_documents: List[GraphDocument], include_source: bool = False + ) -> None: + """ + Takes GraphDocument as input and uses it to construct a graph. + """ + for document in graph_documents: + include_docs_query = ( + "CREATE (d:Document) " + "SET d.text = $document.page_content " + "SET d += $document.metadata " + "WITH d " + ) + # Import nodes + self.query( + ( + f"{include_docs_query if include_source else ''}" + "UNWIND $data AS row " + "CALL apoc.merge.node([row.type], {id: row.id}, " + "row.properties, {}) YIELD node " + f"{'MERGE (d)-[:MENTIONS]->(node) ' if include_source else ''}" + "RETURN distinct 'done' AS result" + ), + { + "data": [el.__dict__ for el in document.nodes], + "document": document.source.__dict__, + }, + ) + # Import relationships + self.query( + "UNWIND $data AS row " + "CALL apoc.merge.node([row.source_label], {id: row.source}," + "{}, {}) YIELD node as source " + "CALL apoc.merge.node([row.target_label], {id: row.target}," + "{}, {}) YIELD node as target " + "CALL apoc.merge.relationship(source, row.type, " + "{}, row.properties, target) YIELD rel " + "RETURN distinct 'done'", + { + "data": [ + { + "source": el.source.id, + "source_label": el.source.type, + "target": el.target.id, + "target_label": el.target.type, + "type": el.type.replace(" ", "_").upper(), + "properties": el.properties, + } + for el in document.relationships + ] + }, + ) diff --git a/libs/langchain/langchain/llms/__init__.py b/libs/langchain/langchain/llms/__init__.py index d8736cfaae..34debd4810 100644 --- a/libs/langchain/langchain/llms/__init__.py +++ b/libs/langchain/langchain/llms/__init__.py @@ -37,6 +37,7 @@ from langchain.llms.chatglm import ChatGLM from langchain.llms.clarifai import Clarifai from langchain.llms.cohere import Cohere from langchain.llms.ctransformers import CTransformers +from langchain.llms.ctranslate2 import CTranslate2 from langchain.llms.databricks import Databricks from langchain.llms.deepinfra import DeepInfra from langchain.llms.deepsparse import DeepSparse @@ -100,6 +101,7 @@ __all__ = [ "Beam", "Bedrock", "CTransformers", + "CTranslate2", "CerebriumAI", "ChatGLM", "Clarifai", @@ -178,6 +180,7 @@ type_to_cls_dict: Dict[str, Type[BaseLLM]] = { "clarifai": Clarifai, "cohere": Cohere, "ctransformers": CTransformers, + "ctranslate2": CTranslate2, "databricks": Databricks, "deepinfra": DeepInfra, "deepsparse": DeepSparse, diff --git a/libs/langchain/langchain/llms/bananadev.py b/libs/langchain/langchain/llms/bananadev.py index f0659118d6..3a984a3cb2 100644 --- a/libs/langchain/langchain/llms/bananadev.py +++ b/libs/langchain/langchain/llms/bananadev.py @@ -15,6 +15,7 @@ class Banana(LLM): To use, you should have the ``banana-dev`` python package installed, and the environment variable
``BANANA_API_KEY`` set with your API key. + This is the team API key available in the Banana dashboard. Any parameters that are valid to be passed to the call can be passed in, even if not explicitly saved on this class. @@ -23,10 +24,13 @@ .. code-block:: python from langchain.llms import Banana - banana = Banana(model_key="") + banana = Banana(model_key="", model_url_slug="") """ model_key: str = "" + """model key to use""" + + model_url_slug: str = "" """model endpoint to use""" model_kwargs: Dict[str, Any] = Field(default_factory=dict) @@ -72,6 +76,7 @@ """Get the identifying parameters.""" return { **{"model_key": self.model_key}, + **{"model_url_slug": self.model_url_slug}, **{"model_kwargs": self.model_kwargs}, } @@ -89,7 +94,7 @@ ) -> str: """Call to Banana endpoint.""" try: - import banana_dev as banana + from banana_dev import Client except ImportError: raise ImportError( "Could not import banana-dev python package. " @@ -99,19 +104,25 @@ params = {**params, **kwargs} api_key = self.banana_api_key model_key = self.model_key + model_url_slug = self.model_url_slug model_inputs = { # a json specific to your model. "prompt": prompt, **params, } - response = banana.run(api_key, model_key, model_inputs) + model = Client( + # Found in main dashboard + api_key=api_key, + # Both found in model details page + model_key=model_key, + url=f"https://{model_url_slug}.run.banana.dev", + ) + response, meta = model.call("/", model_inputs) try: - text = response["modelOutputs"][0]["output"] + text = response["outputs"] except (KeyError, TypeError): - returned = response["modelOutputs"][0] raise ValueError( - "Response should be of schema: {'output': 'text'}." - f"\nResponse was: {returned}" + "Response should be of schema: {'outputs': 'text'}." "\nTo fix this:" "\n- fork the source repo of the Banana model" "\n- modify app.py to return the above schema" diff --git a/libs/langchain/langchain/llms/ctranslate2.py b/libs/langchain/langchain/llms/ctranslate2.py new file mode 100644 index 0000000000..b6180d674d --- /dev/null +++ b/libs/langchain/langchain/llms/ctranslate2.py @@ -0,0 +1,128 @@ +from typing import Any, Dict, List, Optional, Union + +from langchain.callbacks.manager import CallbackManagerForLLMRun +from langchain.llms.base import BaseLLM +from langchain.pydantic_v1 import Field, root_validator +from langchain.schema.output import Generation, LLMResult + + +class CTranslate2(BaseLLM): + """CTranslate2 language model.""" + + model_path: str = "" + """Path to the CTranslate2 model directory.""" + + tokenizer_name: str = "" + """Name of the original Hugging Face model needed to load the proper tokenizer.""" + + device: str = "cpu" + """Device to use (possible values are: cpu, cuda, auto).""" + + device_index: Union[int, List[int]] = 0 + """Device IDs on which to place this generator.""" + + compute_type: Union[str, Dict[str, str]] = "default" + """ + Model computation type or a dictionary mapping a device name to the computation type + (possible values are: default, auto, int8, int8_float32, int8_float16, + int8_bfloat16, int16, float16, bfloat16, float32).
+ """ + + max_length: int = 512 + """Maximum generation length.""" + + sampling_topk: int = 1 + """Randomly sample predictions from the top K candidates.""" + + sampling_topp: float = 1 + """Keep the most probable tokens whose cumulative probability exceeds this value.""" + + sampling_temperature: float = 1 + """Sampling temperature to generate more random samples.""" + + client: Any #: :meta private: + + tokenizer: Any #: :meta private: + + ctranslate2_kwargs: Dict[str, Any] = Field(default_factory=dict) + """ + Holds any model parameters valid for `ctranslate2.Generator` call not + explicitly specified. + """ + + @root_validator() + def validate_environment(cls, values: Dict) -> Dict: + """Validate that python package exists in environment.""" + + try: + import ctranslate2 + except ImportError: + raise ImportError( + "Could not import ctranslate2 python package. " + "Please install it with `pip install ctranslate2`." + ) + + try: + import transformers + except ImportError: + raise ImportError( + "Could not import transformers python package. " + "Please install it with `pip install transformers`." + ) + + values["client"] = ctranslate2.Generator( + model_path=values["model_path"], + device=values["device"], + device_index=values["device_index"], + compute_type=values["compute_type"], + **values["ctranslate2_kwargs"], + ) + + values["tokenizer"] = transformers.AutoTokenizer.from_pretrained( + values["tokenizer_name"] + ) + + return values + + @property + def _default_params(self) -> Dict[str, Any]: + """Get the default parameters.""" + return { + "max_length": self.max_length, + "sampling_topk": self.sampling_topk, + "sampling_topp": self.sampling_topp, + "sampling_temperature": self.sampling_temperature, + } + + def _generate( + self, + prompts: List[str], + stop: Optional[List[str]] = None, + run_manager: Optional[CallbackManagerForLLMRun] = None, + **kwargs: Any, + ) -> LLMResult: + # build sampling parameters + params = {**self._default_params, **kwargs} + + # call the model + encoded_prompts = self.tokenizer(prompts)["input_ids"] + tokenized_prompts = [ + self.tokenizer.convert_ids_to_tokens(encoded_prompt) + for encoded_prompt in encoded_prompts + ] + + results = self.client.generate_batch(tokenized_prompts, **params) + + sequences = [result.sequences_ids[0] for result in results] + decoded_sequences = [self.tokenizer.decode(seq) for seq in sequences] + + generations = [] + for text in decoded_sequences: + generations.append([Generation(text=text)]) + + return LLMResult(generations=generations) + + @property + def _llm_type(self) -> str: + """Return type of llm.""" + return "ctranslate2" diff --git a/libs/langchain/langchain/llms/rwkv.py b/libs/langchain/langchain/llms/rwkv.py index bb54c9d5de..8072b2b91b 100644 --- a/libs/langchain/langchain/llms/rwkv.py +++ b/libs/langchain/langchain/llms/rwkv.py @@ -121,7 +121,7 @@ class RWKV(LLM, BaseModel): values["pipeline"] = PIPELINE(values["client"], values["tokens_path"]) except ImportError: - raise ValueError( + raise ImportError( "Could not import rwkv python package. " "Please install it with `pip install rwkv`." 
) diff --git a/libs/langchain/langchain/llms/vertexai.py b/libs/langchain/langchain/llms/vertexai.py index 2426587f58..aaa5efbecb 100644 --- a/libs/langchain/langchain/llms/vertexai.py +++ b/libs/langchain/langchain/llms/vertexai.py @@ -169,7 +169,7 @@ class VertexAI(_VertexAICommon, LLM): tuned_model_name = values.get("tuned_model_name") model_name = values["model_name"] try: - if tuned_model_name or not is_codey_model(model_name): + if not is_codey_model(model_name): from vertexai.preview.language_models import TextGenerationModel if tuned_model_name: @@ -181,7 +181,12 @@ class VertexAI(_VertexAICommon, LLM): else: from vertexai.preview.language_models import CodeGenerationModel - values["client"] = CodeGenerationModel.from_pretrained(model_name) + if tuned_model_name: + values["client"] = CodeGenerationModel.get_tuned_model( + tuned_model_name + ) + else: + values["client"] = CodeGenerationModel.from_pretrained(model_name) except ImportError: raise_vertex_import_error() return values diff --git a/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py b/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py index 704efa9ea7..06d7897dbd 100644 --- a/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py +++ b/libs/langchain/langchain/memory/chat_message_histories/dynamodb.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import logging -from typing import Dict, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional from langchain.schema import ( BaseChatMessageHistory, @@ -11,6 +13,9 @@ from langchain.schema.messages import ( messages_to_dict, ) +if TYPE_CHECKING: + from boto3.session import Session + logger = logging.getLogger(__name__) @@ -42,13 +47,21 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): endpoint_url: Optional[str] = None, primary_key_name: str = "SessionId", key: Optional[Dict[str, str]] = None, + boto3_session: Optional[Session] = None, ): - import boto3 - - if endpoint_url: - client = boto3.resource("dynamodb", endpoint_url=endpoint_url) + if boto3_session: + client = boto3_session.resource("dynamodb") else: - client = boto3.resource("dynamodb") + try: + import boto3 + except ImportError as e: + raise ImportError( + "Unable to import boto3, please install with `pip install boto3`." + ) from e + if endpoint_url: + client = boto3.resource("dynamodb", endpoint_url=endpoint_url) + else: + client = boto3.resource("dynamodb") self.table = client.Table(table_name) self.session_id = session_id self.key: Dict = key or {primary_key_name: session_id} @@ -56,7 +69,12 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): @property def messages(self) -> List[BaseMessage]: # type: ignore """Retrieve the messages from DynamoDB""" - from botocore.exceptions import ClientError + try: + from botocore.exceptions import ClientError + except ImportError as e: + raise ImportError( + "Unable to import botocore, please install with `pip install botocore`." + ) from e response = None try: @@ -77,7 +95,12 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): def add_message(self, message: BaseMessage) -> None: """Append the message to the record in DynamoDB""" - from botocore.exceptions import ClientError + try: + from botocore.exceptions import ClientError + except ImportError as e: + raise ImportError( + "Unable to import botocore, please install with `pip install botocore`." 
+ ) from e messages = messages_to_dict(self.messages) _message = _message_to_dict(message) @@ -90,7 +113,12 @@ class DynamoDBChatMessageHistory(BaseChatMessageHistory): def clear(self) -> None: """Clear session memory from DynamoDB""" - from botocore.exceptions import ClientError + try: + from botocore.exceptions import ClientError + except ImportError as e: + raise ImportError( + "Unable to import botocore, please install with `pip install botocore`." + ) from e try: self.table.delete_item(self.key) diff --git a/libs/langchain/langchain/prompts/chat.py b/libs/langchain/langchain/prompts/chat.py index 59fcb30c86..5b177bc124 100644 --- a/libs/langchain/langchain/prompts/chat.py +++ b/libs/langchain/langchain/prompts/chat.py @@ -229,7 +229,7 @@ class ChatMessagePromptTemplate(BaseStringMessagePromptTemplate): class HumanMessagePromptTemplate(BaseStringMessagePromptTemplate): - """Human message prompt template. This is a message that is sent to the user.""" + """Human message prompt template. This is a message sent from the user.""" def format(self, **kwargs: Any) -> BaseMessage: """Format the prompt template. @@ -245,7 +245,7 @@ class HumanMessagePromptTemplate(BaseStringMessagePromptTemplate): class AIMessagePromptTemplate(BaseStringMessagePromptTemplate): - """AI message prompt template. This is a message that is not sent to the user.""" + """AI message prompt template. This is a message sent from the AI.""" def format(self, **kwargs: Any) -> BaseMessage: """Format the prompt template. diff --git a/libs/langchain/langchain/retrievers/self_query/base.py b/libs/langchain/langchain/retrievers/self_query/base.py index 0251bff52c..2a7d53277b 100644 --- a/libs/langchain/langchain/retrievers/self_query/base.py +++ b/libs/langchain/langchain/retrievers/self_query/base.py @@ -2,8 +2,8 @@ from typing import Any, Dict, List, Optional, Type, cast -from langchain import LLMChain from langchain.callbacks.manager import CallbackManagerForRetrieverRun +from langchain.chains import LLMChain from langchain.chains.query_constructor.base import load_query_constructor_chain from langchain.chains.query_constructor.ir import StructuredQuery, Visitor from langchain.chains.query_constructor.schema import AttributeInfo @@ -16,6 +16,9 @@ from langchain.retrievers.self_query.milvus import MilvusTranslator from langchain.retrievers.self_query.myscale import MyScaleTranslator from langchain.retrievers.self_query.pinecone import PineconeTranslator from langchain.retrievers.self_query.qdrant import QdrantTranslator +from langchain.retrievers.self_query.redis import RedisTranslator +from langchain.retrievers.self_query.supabase import SupabaseVectorTranslator +from langchain.retrievers.self_query.vectara import VectaraTranslator from langchain.retrievers.self_query.weaviate import WeaviateTranslator from langchain.schema import BaseRetriever, Document from langchain.schema.language_model import BaseLanguageModel @@ -28,6 +31,9 @@ from langchain.vectorstores import ( MyScale, Pinecone, Qdrant, + Redis, + SupabaseVectorStore, + Vectara, VectorStore, Weaviate, ) @@ -35,28 +41,32 @@ from langchain.vectorstores import ( def _get_builtin_translator(vectorstore: VectorStore) -> Visitor: """Get the translator class corresponding to the vector store class.""" - vectorstore_cls = vectorstore.__class__ BUILTIN_TRANSLATORS: Dict[Type[VectorStore], Type[Visitor]] = { Pinecone: PineconeTranslator, Chroma: ChromaTranslator, DashVector: DashvectorTranslator, Weaviate: WeaviateTranslator, + Vectara: VectaraTranslator, Qdrant: 
QdrantTranslator, MyScale: MyScaleTranslator, DeepLake: DeepLakeTranslator, ElasticsearchStore: ElasticsearchTranslator, Milvus: MilvusTranslator, + SupabaseVectorStore: SupabaseVectorTranslator, } - if vectorstore_cls not in BUILTIN_TRANSLATORS: - raise ValueError( - f"Self query retriever with Vector Store type {vectorstore_cls}" - f" not supported." - ) if isinstance(vectorstore, Qdrant): return QdrantTranslator(metadata_key=vectorstore.metadata_payload_key) elif isinstance(vectorstore, MyScale): return MyScaleTranslator(metadata_key=vectorstore.metadata_column) - return BUILTIN_TRANSLATORS[vectorstore_cls]() + elif isinstance(vectorstore, Redis): + return RedisTranslator.from_vectorstore(vectorstore) + elif vectorstore.__class__ in BUILTIN_TRANSLATORS: + return BUILTIN_TRANSLATORS[vectorstore.__class__]() + else: + raise ValueError( + f"Self query retriever with Vector Store type {vectorstore.__class__}" + f" not supported." + ) class SelfQueryRetriever(BaseRetriever, BaseModel): @@ -74,8 +84,9 @@ class SelfQueryRetriever(BaseRetriever, BaseModel): structured_query_translator: Visitor """Translator for turning internal query language into vectorstore search params.""" verbose: bool = False - """Use original query instead of the revised new query from LLM""" + use_original_query: bool = False + """Use original query instead of the revised new query from LLM""" class Config: """Configuration for this pydantic object.""" diff --git a/libs/langchain/langchain/retrievers/self_query/redis.py b/libs/langchain/langchain/retrievers/self_query/redis.py new file mode 100644 index 0000000000..963b58aa68 --- /dev/null +++ b/libs/langchain/langchain/retrievers/self_query/redis.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +from typing import Any, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, + Visitor, +) +from langchain.vectorstores.redis import Redis +from langchain.vectorstores.redis.filters import ( + RedisFilterExpression, + RedisFilterField, + RedisFilterOperator, + RedisNum, + RedisTag, + RedisText, +) +from langchain.vectorstores.redis.schema import RedisModel + +_COMPARATOR_TO_BUILTIN_METHOD = { + Comparator.EQ: "__eq__", + Comparator.NE: "__ne__", + Comparator.LT: "__lt__", + Comparator.GT: "__gt__", + Comparator.LTE: "__le__", + Comparator.GTE: "__ge__", + Comparator.CONTAIN: "__eq__", + Comparator.LIKE: "__mod__", +} + + +class RedisTranslator(Visitor): + """Translate `Redis` internal query language elements to valid filters.""" + + allowed_comparators = ( + Comparator.EQ, + Comparator.NE, + Comparator.LT, + Comparator.LTE, + Comparator.GT, + Comparator.GTE, + Comparator.CONTAIN, + Comparator.LIKE, + ) + """Subset of allowed logical comparators.""" + allowed_operators = (Operator.AND, Operator.OR) + """Subset of allowed logical operators.""" + + def __init__(self, schema: RedisModel) -> None: + self._schema = schema + + def _attribute_to_filter_field(self, attribute: str) -> RedisFilterField: + if attribute in [tf.name for tf in self._schema.text]: + return RedisText(attribute) + elif attribute in [tf.name for tf in self._schema.tag or []]: + return RedisTag(attribute) + elif attribute in [tf.name for tf in self._schema.numeric or []]: + return RedisNum(attribute) + else: + raise ValueError( + f"Invalid attribute {attribute} not in vector store schema.
Schema is:" + f"\n{self._schema.as_dict()}" + ) + + def visit_comparison(self, comparison: Comparison) -> RedisFilterExpression: + filter_field = self._attribute_to_filter_field(comparison.attribute) + comparison_method = _COMPARATOR_TO_BUILTIN_METHOD[comparison.comparator] + return getattr(filter_field, comparison_method)(comparison.value) + + def visit_operation(self, operation: Operation) -> Any: + left = operation.arguments[0].accept(self) + if len(operation.arguments) > 2: + right = self.visit_operation( + Operation( + operator=operation.operator, arguments=operation.arguments[1:] + ) + ) + else: + right = operation.arguments[1].accept(self) + redis_operator = ( + RedisFilterOperator.OR + if operation.operator == Operator.OR + else RedisFilterOperator.AND + ) + return RedisFilterExpression(operator=redis_operator, left=left, right=right) + + def visit_structured_query( + self, structured_query: StructuredQuery + ) -> Tuple[str, dict]: + if structured_query.filter is None: + kwargs = {} + else: + kwargs = {"filter": structured_query.filter.accept(self)} + return structured_query.query, kwargs + + @classmethod + def from_vectorstore(cls, vectorstore: Redis) -> RedisTranslator: + return cls(vectorstore._schema) diff --git a/libs/langchain/langchain/retrievers/self_query/supabase.py b/libs/langchain/langchain/retrievers/self_query/supabase.py new file mode 100644 index 0000000000..267e228fcd --- /dev/null +++ b/libs/langchain/langchain/retrievers/self_query/supabase.py @@ -0,0 +1,97 @@ +from typing import Any, Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, + Visitor, +) + + +class SupabaseVectorTranslator(Visitor): + """Translate Langchain filters to Supabase PostgREST filters.""" + + allowed_operators = [Operator.AND, Operator.OR] + """Subset of allowed logical operators.""" + + allowed_comparators = [ + Comparator.EQ, + Comparator.NE, + Comparator.GT, + Comparator.GTE, + Comparator.LT, + Comparator.LTE, + Comparator.LIKE, + ] + """Subset of allowed logical comparators.""" + + metadata_column = "metadata" + + def _map_comparator(self, comparator: Comparator) -> str: + """ + Maps Langchain comparator to PostgREST comparator: + + https://postgrest.org/en/stable/references/api/tables_views.html#operators + """ + postgrest_comparator = { + Comparator.EQ: "eq", + Comparator.NE: "neq", + Comparator.GT: "gt", + Comparator.GTE: "gte", + Comparator.LT: "lt", + Comparator.LTE: "lte", + Comparator.LIKE: "like", + }.get(comparator) + + if postgrest_comparator is None: + raise Exception( + f"Comparator '{comparator}' is not currently " + "supported in Supabase Vector" + ) + + return postgrest_comparator + + def _get_json_operator(self, value: Any) -> str: + if isinstance(value, str): + return "->>" + else: + return "->" + + def visit_operation(self, operation: Operation) -> str: + args = [arg.accept(self) for arg in operation.arguments] + return f"{operation.operator.value}({','.join(args)})" + + def visit_comparison(self, comparison: Comparison) -> str: + if isinstance(comparison.value, list): + return self.visit_operation( + Operation( + operator=Operator.AND, + arguments=( + Comparison( + comparator=comparison.comparator, + attribute=comparison.attribute, + value=value, + ) + for value in comparison.value + ), + ) + ) + + return ".".join( + [ + f"{self.metadata_column}{self._get_json_operator(comparison.value)}{comparison.attribute}", + f"{self._map_comparator(comparison.comparator)}", + 
f"{comparison.value}", + ] + ) + + def visit_structured_query( + self, structured_query: StructuredQuery + ) -> Tuple[str, Dict[str, str]]: + if structured_query.filter is None: + kwargs = {} + else: + kwargs = {"postgrest_filter": structured_query.filter.accept(self)} + return structured_query.query, kwargs diff --git a/libs/langchain/langchain/retrievers/self_query/vectara.py b/libs/langchain/langchain/retrievers/self_query/vectara.py new file mode 100644 index 0000000000..73dc46ff59 --- /dev/null +++ b/libs/langchain/langchain/retrievers/self_query/vectara.py @@ -0,0 +1,69 @@ +from typing import Tuple, Union + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, + Visitor, +) + + +def process_value(value: Union[int, float, str]) -> str: + if isinstance(value, str): + return f"'{value}'" + else: + return str(value) + + +class VectaraTranslator(Visitor): + """Translate `Vectara` internal query language elements to valid filters.""" + + allowed_operators = [Operator.AND, Operator.OR] + """Subset of allowed logical operators.""" + allowed_comparators = [ + Comparator.EQ, + Comparator.NE, + Comparator.GT, + Comparator.GTE, + Comparator.LT, + Comparator.LTE, + ] + """Subset of allowed logical comparators.""" + + def _format_func(self, func: Union[Operator, Comparator]) -> str: + map_dict = { + Operator.AND: " and ", + Operator.OR: " or ", + Comparator.EQ: "=", + Comparator.NE: "!=", + Comparator.GT: ">", + Comparator.GTE: ">=", + Comparator.LT: "<", + Comparator.LTE: "<=", + } + self._validate_func(func) + return map_dict[func] + + def visit_operation(self, operation: Operation) -> str: + args = [arg.accept(self) for arg in operation.arguments] + operator = self._format_func(operation.operator) + return "( " + operator.join(args) + " )" + + def visit_comparison(self, comparison: Comparison) -> str: + comparator = self._format_func(comparison.comparator) + processed_value = process_value(comparison.value) + attribute = comparison.attribute + return ( + "( " + "doc." 
+ attribute + " " + comparator + " " + processed_value + " )" + ) + + def visit_structured_query( + self, structured_query: StructuredQuery + ) -> Tuple[str, dict]: + if structured_query.filter is None: + kwargs = {} + else: + kwargs = {"filter": structured_query.filter.accept(self)} + return structured_query.query, kwargs diff --git a/libs/langchain/langchain/schema/runnable/base.py b/libs/langchain/langchain/schema/runnable/base.py index 9b23e50438..51ccc58f9f 100644 --- a/libs/langchain/langchain/schema/runnable/base.py +++ b/libs/langchain/langchain/schema/runnable/base.py @@ -39,6 +39,8 @@ from langchain.load.serializable import Serializable from langchain.pydantic_v1 import Field from langchain.schema.runnable.config import ( RunnableConfig, + acall_func_with_variable_args, + call_func_with_variable_args, ensure_config, get_async_callback_manager_for_config, get_callback_manager_for_config, @@ -47,16 +49,15 @@ from langchain.schema.runnable.config import ( patch_config, ) from langchain.schema.runnable.utils import ( + Input, + Output, + accepts_config, accepts_run_manager, - accepts_run_manager_and_config, gather_with_concurrency, ) from langchain.utils.aiter import atee, py_anext from langchain.utils.iter import safetee -Input = TypeVar("Input") -# Output type should implement __concat__, as eg str, list, dict do -Output = TypeVar("Output") Other = TypeVar("Other") @@ -253,7 +254,7 @@ class Runnable(Generic[Input, Output], ABC): def with_retry( self, *, - retry_if_exception_type: Tuple[Type[BaseException]] = (Exception,), + retry_if_exception_type: Tuple[Type[BaseException], ...] = (Exception,), wait_exponential_jitter: bool = True, stop_after_attempt: int = 3, ) -> Runnable[Input, Output]: @@ -279,7 +280,7 @@ class Runnable(Generic[Input, Output], ABC): self, fallbacks: Sequence[Runnable[Input, Output]], *, - exceptions_to_handle: Tuple[Type[BaseException]] = (Exception,), + exceptions_to_handle: Tuple[Type[BaseException], ...] 
= (Exception,), ) -> RunnableWithFallbacks[Input, Output]: return RunnableWithFallbacks( runnable=self, @@ -311,16 +312,7 @@ class Runnable(Generic[Input, Output], ABC): name=config.get("run_name"), ) try: - if accepts_run_manager_and_config(func): - output = func( - input, - run_manager=run_manager, - config=config, - ) # type: ignore[call-arg] - elif accepts_run_manager(func): - output = func(input, run_manager=run_manager) # type: ignore[call-arg] - else: - output = func(input) # type: ignore[call-arg] + output = call_func_with_variable_args(func, input, run_manager, config) except Exception as e: run_manager.on_chain_error(e) raise @@ -353,19 +345,9 @@ class Runnable(Generic[Input, Output], ABC): name=config.get("run_name"), ) try: - if accepts_run_manager_and_config(func): - output = await func( - input, - run_manager=run_manager, - config=config, - ) # type: ignore[call-arg] - elif accepts_run_manager(func): - output = await func( - input, - run_manager=run_manager, - ) # type: ignore[call-arg] - else: - output = await func(input) # type: ignore[call-arg] + output = await acall_func_with_variable_args( + func, input, run_manager, config + ) except Exception as e: await run_manager.on_chain_error(e) raise @@ -408,16 +390,15 @@ class Runnable(Generic[Input, Output], ABC): ) ] try: - if accepts_run_manager_and_config(func): - output = func( - input, - run_manager=run_managers, - config=configs, - ) # type: ignore[call-arg] - elif accepts_run_manager(func): - output = func(input, run_manager=run_managers) # type: ignore[call-arg] - else: - output = func(input) # type: ignore[call-arg] + kwargs: Dict[str, Any] = {} + if accepts_config(func): + kwargs["config"] = [ + patch_config(c, callbacks=rm.get_child()) + for c, rm in zip(configs, run_managers) + ] + if accepts_run_manager(func): + kwargs["run_manager"] = run_managers + output = func(input, **kwargs) # type: ignore[call-arg] except Exception as e: for run_manager in run_managers: run_manager.on_chain_error(e) @@ -479,16 +460,15 @@ class Runnable(Generic[Input, Output], ABC): ) ) try: - if accepts_run_manager_and_config(func): - output = await func( - input, - run_manager=run_managers, - config=configs, - ) # type: ignore[call-arg] - elif accepts_run_manager(func): - output = await func(input, run_manager=run_managers) # type: ignore - else: - output = await func(input) # type: ignore[call-arg] + kwargs: Dict[str, Any] = {} + if accepts_config(func): + kwargs["config"] = [ + patch_config(c, callbacks=rm.get_child()) + for c, rm in zip(configs, run_managers) + ] + if accepts_run_manager(func): + kwargs["run_manager"] = run_managers + output = await func(input, **kwargs) # type: ignore[call-arg] except Exception as e: await asyncio.gather( *(run_manager.on_chain_error(e) for run_manager in run_managers) @@ -550,19 +530,16 @@ class Runnable(Generic[Input, Output], ABC): name=config.get("run_name"), ) try: - if accepts_run_manager_and_config(transformer): - iterator = transformer( - input_for_transform, - run_manager=run_manager, - config=config, - ) # type: ignore[call-arg] - elif accepts_run_manager(transformer): - iterator = transformer( - input_for_transform, - run_manager=run_manager, - ) # type: ignore[call-arg] - else: - iterator = transformer(input_for_transform) # type: ignore[call-arg] + kwargs: Dict[str, Any] = {} + if accepts_config(transformer): + kwargs["config"] = patch_config( + config, callbacks=run_manager.get_child() + ) + if accepts_run_manager(transformer): + kwargs["run_manager"] = run_manager + iterator = 
transformer( + input_for_transform, **kwargs + ) # type: ignore[call-arg] for chunk in iterator: yield chunk if final_output_supported: @@ -631,21 +608,16 @@ class Runnable(Generic[Input, Output], ABC): name=config.get("run_name"), ) try: - # mypy can't quite work out thew type guard here, but this is safe, - # check implementations of the accepts_* functions - if accepts_run_manager_and_config(transformer): - iterator = transformer( - input_for_transform, - run_manager=run_manager, - config=config, - ) # type: ignore[call-arg] - elif accepts_run_manager(transformer): - iterator = transformer( - input_for_transform, - run_manager=run_manager, - ) # type: ignore[call-arg] - else: - iterator = transformer(input_for_transform) # type: ignore[call-arg] + kwargs: Dict[str, Any] = {} + if accepts_config(transformer): + kwargs["config"] = patch_config( + config, callbacks=run_manager.get_child() + ) + if accepts_run_manager(transformer): + kwargs["run_manager"] = run_manager + iterator = transformer( + input_for_transform, **kwargs + ) # type: ignore[call-arg] async for chunk in iterator: yield chunk if final_output_supported: @@ -681,7 +653,7 @@ class RunnableWithFallbacks(Serializable, Runnable[Input, Output]): runnable: Runnable[Input, Output] fallbacks: Sequence[Runnable[Input, Output]] - exceptions_to_handle: Tuple[Type[BaseException]] = (Exception,) + exceptions_to_handle: Tuple[Type[BaseException], ...] = (Exception,) class Config: arbitrary_types_allowed = True @@ -1756,7 +1728,7 @@ class RunnableLambda(Runnable[Input, Output]): run_manager: CallbackManagerForChainRun, config: RunnableConfig, ) -> Output: - output = self.func(input) + output = call_func_with_variable_args(self.func, input, run_manager, config) # If the output is a runnable, invoke it if isinstance(output, Runnable): recursion_limit = config["recursion_limit"] @@ -1780,7 +1752,9 @@ class RunnableLambda(Runnable[Input, Output]): run_manager: AsyncCallbackManagerForChainRun, config: RunnableConfig, ) -> Output: - output = await self.afunc(input) + output = await acall_func_with_variable_args( + self.afunc, input, run_manager, config + ) # If the output is a runnable, invoke it if isinstance(output, Runnable): recursion_limit = config["recursion_limit"] @@ -1798,6 +1772,21 @@ class RunnableLambda(Runnable[Input, Output]): ) return output + def _config( + self, config: Optional[RunnableConfig], callable: Callable[..., Any] + ) -> RunnableConfig: + config = config or {} + + if config.get("run_name") is None: + try: + run_name = callable.__name__ + except AttributeError: + run_name = None + if run_name is not None: + return patch_config(config, run_name=run_name) + + return config + def invoke( self, input: Input, @@ -1805,7 +1794,11 @@ class RunnableLambda(Runnable[Input, Output]): **kwargs: Optional[Any], ) -> Output: if hasattr(self, "func"): - return self._call_with_config(self._invoke, input, config) + return self._call_with_config( + self._invoke, + input, + self._config(config, self.func), + ) else: raise TypeError( "Cannot invoke a coroutine function synchronously." @@ -1819,7 +1812,11 @@ class RunnableLambda(Runnable[Input, Output]): **kwargs: Optional[Any], ) -> Output: if hasattr(self, "afunc"): - return await self._acall_with_config(self._ainvoke, input, config) + return await self._acall_with_config( + self._ainvoke, + input, + self._config(config, self.afunc), + ) else: # Delegating to super implementation of ainvoke. 
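The runnable/base.py hunks above replace the all-or-nothing `accepts_run_manager_and_config` check with two independent signature probes, `accepts_config` and `accepts_run_manager`, funneled through a single dispatch helper. Below is a minimal, self-contained sketch of that dispatch pattern; the toy callables and the stand-in `run_manager`/`config` values are illustrative only, and the real `call_func_with_variable_args` in the config.py diff that follows additionally re-parents callbacks via `run_manager.get_child()`.

from inspect import signature
from typing import Any, Callable, Dict


def _accepts(name: str, func: Callable[..., Any]) -> bool:
    # Probe the signature the same way accepts_config/accepts_run_manager do;
    # un-inspectable callables raise ValueError and count as "no".
    try:
        return signature(func).parameters.get(name) is not None
    except ValueError:
        return False


def call_with_variable_args(
    func: Callable[..., Any], input: Any, run_manager: Any, config: Dict[str, Any]
) -> Any:
    # Only pass the keyword arguments the target function actually declares.
    kwargs: Dict[str, Any] = {}
    if _accepts("config", func):
        kwargs["config"] = config
    if _accepts("run_manager", func):
        kwargs["run_manager"] = run_manager
    return func(input, **kwargs)


def plain(x: int) -> int:
    return x + 1


def with_manager(x: int, run_manager: Any) -> int:
    return x + 1


def with_both(x: int, run_manager: Any, config: Dict[str, Any]) -> int:
    return x + 1


# All three callables are invoked through the same code path.
for f in (plain, with_manager, with_both):
    assert call_with_variable_args(f, 1, run_manager=object(), config={}) == 2
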
# Uses asyncio executor to run the sync version (invoke) diff --git a/libs/langchain/langchain/schema/runnable/config.py b/libs/langchain/langchain/schema/runnable/config.py index 3f87f04403..987a2c7d2f 100644 --- a/libs/langchain/langchain/schema/runnable/config.py +++ b/libs/langchain/langchain/schema/runnable/config.py @@ -3,13 +3,35 @@ from __future__ import annotations from concurrent.futures import Executor, ThreadPoolExecutor from contextlib import contextmanager from copy import deepcopy -from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union +from typing import ( + TYPE_CHECKING, + Any, + Awaitable, + Callable, + Dict, + Generator, + List, + Optional, + Union, +) from typing_extensions import TypedDict +from langchain.schema.runnable.utils import ( + Input, + Output, + accepts_config, + accepts_run_manager, +) + if TYPE_CHECKING: from langchain.callbacks.base import BaseCallbackManager, Callbacks - from langchain.callbacks.manager import AsyncCallbackManager, CallbackManager + from langchain.callbacks.manager import ( + AsyncCallbackManager, + AsyncCallbackManagerForChainRun, + CallbackManager, + CallbackManagerForChainRun, + ) class RunnableConfig(TypedDict, total=False): @@ -117,6 +139,47 @@ def patch_config( return config +def call_func_with_variable_args( + func: Union[ + Callable[[Input], Output], + Callable[[Input, CallbackManagerForChainRun], Output], + Callable[[Input, CallbackManagerForChainRun, RunnableConfig], Output], + ], + input: Input, + run_manager: CallbackManagerForChainRun, + config: RunnableConfig, +) -> Output: + """Call function that may optionally accept a run_manager and/or config.""" + kwargs: Dict[str, Any] = {} + if accepts_config(func): + kwargs["config"] = patch_config(config, callbacks=run_manager.get_child()) + if accepts_run_manager(func): + kwargs["run_manager"] = run_manager + return func(input, **kwargs) # type: ignore[call-arg] + + +async def acall_func_with_variable_args( + func: Union[ + Callable[[Input], Awaitable[Output]], + Callable[[Input, AsyncCallbackManagerForChainRun], Awaitable[Output]], + Callable[ + [Input, AsyncCallbackManagerForChainRun, RunnableConfig], + Awaitable[Output], + ], + ], + input: Input, + run_manager: AsyncCallbackManagerForChainRun, + config: RunnableConfig, +) -> Output: + """Call function that may optionally accept a run_manager and/or config.""" + kwargs: Dict[str, Any] = {} + if accepts_config(func): + kwargs["config"] = patch_config(config, callbacks=run_manager.get_child()) + if accepts_run_manager(func): + kwargs["run_manager"] = run_manager + return await func(input, **kwargs) # type: ignore[call-arg] + + def get_callback_manager_for_config(config: RunnableConfig) -> CallbackManager: from langchain.callbacks.manager import CallbackManager diff --git a/libs/langchain/langchain/schema/runnable/retry.py b/libs/langchain/langchain/schema/runnable/retry.py index 37de03f600..b41f74583b 100644 --- a/libs/langchain/langchain/schema/runnable/retry.py +++ b/libs/langchain/langchain/schema/runnable/retry.py @@ -24,7 +24,7 @@ U = TypeVar("U") class RunnableRetry(RunnableBinding[Input, Output]): """Retry a Runnable if it fails.""" - retry_exception_types: Tuple[Type[BaseException]] = (Exception,) + retry_exception_types: Tuple[Type[BaseException], ...] 
= (Exception,) wait_exponential_jitter: bool = True diff --git a/libs/langchain/langchain/schema/runnable/utils.py b/libs/langchain/langchain/schema/runnable/utils.py index 2afa3705c4..43d9b325fd 100644 --- a/libs/langchain/langchain/schema/runnable/utils.py +++ b/libs/langchain/langchain/schema/runnable/utils.py @@ -2,7 +2,11 @@ from __future__ import annotations import asyncio from inspect import signature -from typing import Any, Callable, Coroutine, Union +from typing import Any, Callable, Coroutine, TypeVar, Union + +Input = TypeVar("Input") +# Output type should implement __concat__, as eg str, list, dict do +Output = TypeVar("Output") async def gated_coro(semaphore: asyncio.Semaphore, coro: Coroutine) -> Any: @@ -26,8 +30,8 @@ def accepts_run_manager(callable: Callable[..., Any]) -> bool: return False -def accepts_run_manager_and_config(callable: Callable[..., Any]) -> bool: - return ( - accepts_run_manager(callable) - and signature(callable).parameters.get("config") is not None - ) +def accepts_config(callable: Callable[..., Any]) -> bool: + try: + return signature(callable).parameters.get("config") is not None + except ValueError: + return False diff --git a/libs/langchain/langchain/smith/evaluation/name_generation.py b/libs/langchain/langchain/smith/evaluation/name_generation.py new file mode 100644 index 0000000000..33ac3a6753 --- /dev/null +++ b/libs/langchain/langchain/smith/evaluation/name_generation.py @@ -0,0 +1,729 @@ +import random + +adjectives = [ + "abandoned", + "aching", + "advanced", + "ample", + "artistic", + "back", + "best", + "bold", + "brief", + "clear", + "cold", + "complicated", + "cooked", + "crazy", + "crushing", + "damp", + "dear", + "definite", + "dependable", + "diligent", + "drab", + "earnest", + "elderly", + "enchanted", + "essential", + "excellent", + "extraneous", + "fixed", + "flowery", + "formal", + "fresh", + "frosty", + "giving", + "glossy", + "healthy", + "helpful", + "impressionable", + "kind", + "large", + "left", + "long", + "loyal", + "mealy", + "memorable", + "monthly", + "new", + "notable", + "only", + "ordinary", + "passionate", + "perfect", + "pertinent", + "proper", + "puzzled", + "reflecting", + "respectful", + "roasted", + "scholarly", + "shiny", + "slight", + "sparkling", + "spotless", + "stupendous", + "sunny", + "tart", + "terrific", + "timely", + "unique", + "upbeat", + "vacant", + "virtual", + "warm", + "weary", + "whispered", + "worthwhile", + "yellow", +] + +nouns = [ + "account", + "acknowledgment", + "address", + "advertising", + "airplane", + "animal", + "appointment", + "arrival", + "artist", + "attachment", + "attitude", + "availability", + "backpack", + "bag", + "balance", + "bass", + "bean", + "beauty", + "bibliography", + "bill", + "bite", + "blossom", + "boat", + "book", + "box", + "boy", + "bread", + "bridge", + "broccoli", + "building", + "butter", + "button", + "cabbage", + "cake", + "camera", + "camp", + "candle", + "candy", + "canvas", + "car", + "card", + "carrot", + "cart", + "case", + "cat", + "chain", + "chair", + "chalk", + "chance", + "change", + "channel", + "character", + "charge", + "charm", + "chart", + "check", + "cheek", + "cheese", + "chef", + "cherry", + "chicken", + "child", + "church", + "circle", + "class", + "clay", + "click", + "clock", + "cloth", + "cloud", + "clove", + "club", + "coach", + "coal", + "coast", + "coat", + "cod", + "coffee", + "collar", + "color", + "comb", + "comfort", + "comic", + "committee", + "community", + "company", + "comparison", + "competition", + "condition", + 
"connection", + "control", + "cook", + "copper", + "copy", + "corn", + "cough", + "country", + "cover", + "crate", + "crayon", + "cream", + "creator", + "crew", + "crown", + "current", + "curtain", + "curve", + "cushion", + "dad", + "daughter", + "day", + "death", + "debt", + "decision", + "deer", + "degree", + "design", + "desire", + "desk", + "detail", + "development", + "digestion", + "dime", + "dinner", + "direction", + "dirt", + "discovery", + "discussion", + "disease", + "disgust", + "distance", + "distribution", + "division", + "doctor", + "dog", + "door", + "drain", + "drawer", + "dress", + "drink", + "driving", + "dust", + "ear", + "earth", + "edge", + "education", + "effect", + "egg", + "end", + "energy", + "engine", + "error", + "event", + "example", + "exchange", + "existence", + "expansion", + "experience", + "expert", + "eye", + "face", + "fact", + "fall", + "family", + "farm", + "father", + "fear", + "feeling", + "field", + "finger", + "fire", + "fish", + "flag", + "flight", + "floor", + "flower", + "fold", + "food", + "football", + "force", + "form", + "frame", + "friend", + "frog", + "fruit", + "fuel", + "furniture", + "game", + "garden", + "gate", + "girl", + "glass", + "glove", + "goat", + "gold", + "government", + "grade", + "grain", + "grass", + "green", + "grip", + "group", + "growth", + "guide", + "guitar", + "hair", + "hall", + "hand", + "harbor", + "harmony", + "hat", + "head", + "health", + "heart", + "heat", + "hill", + "history", + "hobbies", + "hole", + "hope", + "horn", + "horse", + "hospital", + "hour", + "house", + "humor", + "idea", + "impulse", + "income", + "increase", + "industry", + "ink", + "insect", + "instrument", + "insurance", + "interest", + "invention", + "iron", + "island", + "jelly", + "jet", + "jewel", + "join", + "judge", + "juice", + "jump", + "kettle", + "key", + "kick", + "kiss", + "kitten", + "knee", + "knife", + "knowledge", + "land", + "language", + "laugh", + "law", + "lead", + "learning", + "leather", + "leg", + "lettuce", + "level", + "library", + "lift", + "light", + "limit", + "line", + "linen", + "lip", + "liquid", + "list", + "look", + "loss", + "love", + "lunch", + "machine", + "man", + "manager", + "map", + "marble", + "mark", + "market", + "mass", + "match", + "meal", + "measure", + "meat", + "meeting", + "memory", + "metal", + "middle", + "milk", + "mind", + "mine", + "minute", + "mist", + "mitten", + "mom", + "money", + "monkey", + "month", + "moon", + "morning", + "mother", + "motion", + "mountain", + "mouth", + "muscle", + "music", + "nail", + "name", + "nation", + "neck", + "need", + "news", + "night", + "noise", + "note", + "number", + "nut", + "observation", + "offer", + "oil", + "operation", + "opinion", + "orange", + "order", + "organization", + "ornament", + "oven", + "page", + "pail", + "pain", + "paint", + "pan", + "pancake", + "paper", + "parcel", + "parent", + "part", + "passenger", + "paste", + "payment", + "peace", + "pear", + "pen", + "pencil", + "person", + "pest", + "pet", + "picture", + "pie", + "pin", + "pipe", + "pizza", + "place", + "plane", + "plant", + "plastic", + "plate", + "play", + "pleasure", + "plot", + "plough", + "pocket", + "point", + "poison", + "police", + "pollution", + "popcorn", + "porter", + "position", + "pot", + "potato", + "powder", + "power", + "price", + "print", + "process", + "produce", + "product", + "profit", + "property", + "prose", + "protest", + "pull", + "pump", + "punishment", + "purpose", + "push", + "quarter", + "question", + "quiet", + "quill", + "quilt", + "quince", + 
"rabbit", + "rail", + "rain", + "range", + "rat", + "rate", + "ray", + "reaction", + "reading", + "reason", + "record", + "regret", + "relation", + "religion", + "representative", + "request", + "respect", + "rest", + "reward", + "rhythm", + "rice", + "river", + "road", + "roll", + "room", + "root", + "rose", + "route", + "rub", + "rule", + "run", + "sack", + "sail", + "salt", + "sand", + "scale", + "scarecrow", + "scarf", + "scene", + "scent", + "school", + "science", + "scissors", + "screw", + "sea", + "seat", + "secretary", + "seed", + "selection", + "self", + "sense", + "servant", + "shade", + "shake", + "shame", + "shape", + "sheep", + "sheet", + "shelf", + "ship", + "shirt", + "shock", + "shoe", + "shop", + "show", + "side", + "sign", + "silk", + "sink", + "sister", + "size", + "sky", + "slave", + "sleep", + "smash", + "smell", + "smile", + "smoke", + "snail", + "snake", + "sneeze", + "snow", + "soap", + "society", + "sock", + "soda", + "sofa", + "son", + "song", + "sort", + "sound", + "soup", + "space", + "spark", + "speed", + "sponge", + "spoon", + "spray", + "spring", + "spy", + "square", + "stamp", + "star", + "start", + "statement", + "station", + "steam", + "steel", + "stem", + "step", + "stew", + "stick", + "stitch", + "stocking", + "stomach", + "stone", + "stop", + "store", + "story", + "stove", + "stranger", + "straw", + "stream", + "street", + "stretch", + "string", + "structure", + "substance", + "sugar", + "suggestion", + "suit", + "summer", + "sun", + "support", + "surprise", + "sweater", + "swim", + "system", + "table", + "tail", + "talk", + "tank", + "taste", + "tax", + "tea", + "teaching", + "team", + "tendency", + "test", + "texture", + "theory", + "thing", + "thought", + "thread", + "throat", + "thumb", + "thunder", + "ticket", + "time", + "tin", + "title", + "toad", + "toe", + "tooth", + "toothpaste", + "touch", + "town", + "toy", + "trade", + "train", + "transport", + "tray", + "treatment", + "tree", + "trick", + "trip", + "trouble", + "trousers", + "truck", + "tub", + "turkey", + "turn", + "twist", + "umbrella", + "uncle", + "underwear", + "unit", + "use", + "vacation", + "value", + "van", + "vase", + "vegetable", + "veil", + "vein", + "verse", + "vessel", + "view", + "visitor", + "voice", + "volcano", + "walk", + "wall", + "war", + "wash", + "waste", + "watch", + "water", + "wave", + "wax", + "way", + "wealth", + "weather", + "week", + "weight", + "wheel", + "whip", + "whistle", + "window", + "wine", + "wing", + "winter", + "wire", + "wish", + "woman", + "wood", + "wool", + "word", + "work", + "worm", + "wound", + "wrist", + "writer", + "yard", + "yoke", + "zebra", + "zinc", + "zipper", + "zone", +] + + +def random_name(prefix: str = "test") -> str: + """Generate a random name.""" + adjective = random.choice(adjectives) + noun = random.choice(nouns) + number = random.randint(1, 100) + + return f"{prefix}-{adjective}-{noun}-{number}" diff --git a/libs/langchain/langchain/smith/evaluation/progress.py b/libs/langchain/langchain/smith/evaluation/progress.py new file mode 100644 index 0000000000..1ea51eee42 --- /dev/null +++ b/libs/langchain/langchain/smith/evaluation/progress.py @@ -0,0 +1,82 @@ +"""A simple progress bar for the console.""" +import threading +from typing import Any, Dict, Optional, Sequence +from uuid import UUID + +from langchain.callbacks import base as base_callbacks +from langchain.schema.document import Document +from langchain.schema.output import LLMResult + + +class ProgressBarCallback(base_callbacks.BaseCallbackHandler): + """A simple 
progress bar for the console.""" + + def __init__(self, total: int, ncols: int = 50, **kwargs: Any): + """Initialize the progress bar. + + Args: + total: int, the total number of items to be processed. + ncols: int, the character width of the progress bar. + """ + self.total = total + self.ncols = ncols + self.counter = 0 + self.lock = threading.Lock() + self._print_bar() + + def increment(self) -> None: + """Increment the counter and update the progress bar.""" + with self.lock: + self.counter += 1 + self._print_bar() + + def _print_bar(self) -> None: + """Print the progress bar to the console.""" + progress = self.counter / self.total + arrow = "-" * int(round(progress * self.ncols) - 1) + ">" + spaces = " " * (self.ncols - len(arrow)) + print(f"\r[{arrow + spaces}] {self.counter}/{self.total}", end="") + + def on_chain_end( + self, + outputs: Dict[str, Any], + *, + run_id: UUID, + parent_run_id: Optional[UUID] = None, + **kwargs: Any, + ) -> Any: + if parent_run_id is None: + self.increment() + + def on_retriever_end( + self, + documents: Sequence[Document], + *, + run_id: UUID, + parent_run_id: Optional[UUID] = None, + **kwargs: Any, + ) -> Any: + if parent_run_id is None: + self.increment() + + def on_llm_end( + self, + response: LLMResult, + *, + run_id: UUID, + parent_run_id: Optional[UUID] = None, + **kwargs: Any, + ) -> Any: + if parent_run_id is None: + self.increment() + + def on_tool_end( + self, + output: str, + *, + run_id: UUID, + parent_run_id: Optional[UUID] = None, + **kwargs: Any, + ) -> Any: + if parent_run_id is None: + self.increment() diff --git a/libs/langchain/langchain/smith/evaluation/runner_utils.py b/libs/langchain/langchain/smith/evaluation/runner_utils.py index 1f432053e2..e6dfe827f6 100644 --- a/libs/langchain/langchain/smith/evaluation/runner_utils.py +++ b/libs/langchain/langchain/smith/evaluation/runner_utils.py @@ -2,21 +2,16 @@ from __future__ import annotations -import asyncio import functools import inspect -import itertools import logging -import uuid import warnings from enum import Enum from typing import ( TYPE_CHECKING, Any, Callable, - Coroutine, Dict, - Iterator, List, Optional, Sequence, @@ -24,16 +19,13 @@ from typing import ( Union, cast, ) -from urllib.parse import urlparse, urlunparse from langsmith import Client, RunEvaluator from langsmith.schemas import Dataset, DataType, Example -from langchain.callbacks.base import BaseCallbackHandler from langchain.callbacks.manager import Callbacks -from langchain.callbacks.tracers.base import BaseTracer from langchain.callbacks.tracers.evaluation import EvaluatorCallbackHandler -from langchain.callbacks.tracers.langchain import LangChainTracer +from langchain.callbacks.tracers.langchain import LangChainTracer, wait_for_all_tracers from langchain.chains.base import Chain from langchain.evaluation.loading import load_evaluator from langchain.evaluation.schema import EvaluatorType, StringEvaluator @@ -41,8 +33,11 @@ from langchain.schema import ChatResult, LLMResult from langchain.schema.language_model import BaseLanguageModel from langchain.schema.messages import BaseMessage, messages_from_dict from langchain.schema.runnable import Runnable, RunnableConfig, RunnableLambda -from langchain.smith.evaluation.config import EvalConfig, RunEvalConfig -from langchain.smith.evaluation.string_run_evaluator import StringRunEvaluatorChain +from langchain.schema.runnable import config as runnable_config +from langchain.schema.runnable import utils as runnable_utils +from langchain.smith import evaluation as 
smith_eval +from langchain.smith.evaluation import config as smith_eval_config +from langchain.smith.evaluation import name_generation, progress if TYPE_CHECKING: import pandas as pd @@ -69,6 +64,26 @@ class InputFormatError(Exception): class TestResult(dict): """A dictionary of the results of a single test run.""" + def get_aggregate_feedback( + self, quantiles: Optional[Sequence[float]] = None + ) -> pd.DataFrame: + """Return quantiles for the feedback scores. + + This method calculates and prints the quantiles for the feedback scores + across all feedback keys. + + Returns: + A DataFrame containing the quantiles for each feedback key. + """ + df = self.to_dataframe() + feedback_cols = [ + col for col in df.columns if col not in ["input", "output", "reference"] + ] + _quantiles = df[feedback_cols].quantile( + quantiles or [0.25, 0.5, 0.75], numeric_only=True + ) + return _quantiles.transpose() + def to_dataframe(self) -> pd.DataFrame: """Convert the results to a dataframe.""" try: @@ -83,27 +98,19 @@ class TestResult(dict): records = [] for example_id, result in self["results"].items(): feedback = result["feedback"] - records.append( - {**{f.key: f.score for f in feedback}, "output": result["output"]} - ) + r = { + **{f.key: f.score for f in feedback}, + "input": result["input"], + "output": result["output"], + } + if "reference" in result: + r["reference"] = result["reference"] + records.append(r) indices.append(example_id) return pd.DataFrame(records, index=indices) -def _get_eval_project_url(api_url: str, project_id: str) -> str: - """Get the project url from the api url.""" - parsed = urlparse(api_url) - hostname = parsed.hostname or "" - if "api." in hostname: - hostname = hostname.replace("api.", "", 1) - if "localhost" in hostname: - # Remove the port - hostname = "localhost" - url = urlunparse(parsed._replace(netloc=hostname)) - return f"{url}/projects/p/{project_id}?eval=true" - - def _wrap_in_chain_factory( llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, dataset_name: str = "<my_dataset>", @@ -172,15 +179,6 @@ def _wrap_in_chain_factory( return llm_or_chain_factory -def _first_example(examples: Iterator[Example]) -> Tuple[Example, Iterator[Example]]: - """Get the first example while chaining it back and preserving the iterator.""" - try: - example: Example = next(examples) - except StopIteration: - raise ValueError("No examples provided.") - return example, itertools.chain([example], examples) - - def _get_prompt(inputs: Dict[str, Any]) -> str: """Get prompt from inputs. @@ -277,31 +275,7 @@ def _get_messages(inputs: Dict[str, Any]) -> List[BaseMessage]: ) -def _get_project_name( - project_name: Optional[str], - llm_or_chain_factory: MCF, -) -> str: - """ - Get the project name. - - Args: - project_name: The project name if manually specified. - llm_or_chain_factory: The Chain or language model constructor. - - Returns: - The project name. 
- """ - if project_name is not None: - return project_name - if isinstance(llm_or_chain_factory, BaseLanguageModel): - model_name = llm_or_chain_factory.__class__.__name__ - else: - model_name = llm_or_chain_factory().__class__.__name__ - hex = uuid.uuid4().hex - return f"{hex}-{model_name}" - - -## Shared Validation Utilities +## Shared data validation utilities def _validate_example_inputs_for_language_model( first_example: Example, input_mapper: Optional[Callable[[Dict], Any]], @@ -373,22 +347,20 @@ def _validate_example_inputs_for_chain( def _validate_example_inputs( - examples: Iterator[Example], + example: Example, llm_or_chain_factory: MCF, input_mapper: Optional[Callable[[Dict], Any]], -) -> Iterator[Example]: +) -> None: """Validate that the example inputs are valid for the model.""" - first_example, examples = _first_example(examples) if isinstance(llm_or_chain_factory, BaseLanguageModel): - _validate_example_inputs_for_language_model(first_example, input_mapper) + _validate_example_inputs_for_language_model(example, input_mapper) else: chain = llm_or_chain_factory() if isinstance(chain, Chain): # Otherwise it's a runnable - _validate_example_inputs_for_chain(first_example, chain, input_mapper) + _validate_example_inputs_for_chain(example, chain, input_mapper) elif isinstance(chain, Runnable): logger.debug(f"Skipping input validation for {chain}") - return examples ## Shared Evaluator Setup Utilities @@ -396,13 +368,12 @@ def _validate_example_inputs( def _setup_evaluation( llm_or_chain_factory: MCF, - examples: Iterator[Example], - evaluation: Optional[RunEvalConfig], + examples: List[Example], + evaluation: Optional[smith_eval.RunEvalConfig], data_type: DataType, -) -> Tuple[Optional[List[RunEvaluator]], Iterator[Example]]: +) -> Optional[List[RunEvaluator]]: """Configure the evaluators to run on the results of the chain.""" if evaluation: - first_example, examples = _first_example(examples) if isinstance(llm_or_chain_factory, BaseLanguageModel): run_inputs, run_outputs = None, None run_type = "llm" @@ -422,18 +393,18 @@ def _setup_evaluation( evaluation, run_type, data_type, - list(first_example.outputs) if first_example.outputs else None, + list(examples[0].outputs) if examples[0].outputs else None, run_inputs, run_outputs, ) else: # TODO: Create a default helpfulness evaluator run_evaluators = None - return run_evaluators, examples + return run_evaluators def _determine_input_key( - config: RunEvalConfig, + config: smith_eval.RunEvalConfig, run_inputs: Optional[List[str]], ) -> Optional[str]: input_key = None @@ -452,7 +423,7 @@ def _determine_input_key( def _determine_prediction_key( - config: RunEvalConfig, + config: smith_eval.RunEvalConfig, run_outputs: Optional[List[str]], ) -> Optional[str]: prediction_key = None @@ -473,7 +444,7 @@ def _determine_prediction_key( def _determine_reference_key( - config: RunEvalConfig, + config: smith_eval.RunEvalConfig, example_outputs: Optional[List[str]], ) -> Optional[str]: if config.reference_key: @@ -491,7 +462,7 @@ def _determine_reference_key( def _construct_run_evaluator( - eval_config: Union[EvaluatorType, str, EvalConfig], + eval_config: Union[EvaluatorType, str, smith_eval_config.EvalConfig], eval_llm: Optional[BaseLanguageModel], run_type: str, data_type: DataType, @@ -513,11 +484,11 @@ def _construct_run_evaluator( if isinstance(evaluator_, StringEvaluator): if evaluator_.requires_reference and reference_key is None: raise ValueError( - f"Must specify reference_key in RunEvalConfig to use" + f"Must specify reference_key in 
smith_eval.RunEvalConfig to use" f" evaluator of type {eval_type_tag} with" f" dataset with multiple output keys: {example_outputs}." ) - run_evaluator = StringRunEvaluatorChain.from_run_and_data_type( + run_evaluator = smith_eval.StringRunEvaluatorChain.from_run_and_data_type( evaluator_, run_type, data_type, @@ -534,7 +505,7 @@ def _construct_run_evaluator( def _get_keys( - config: RunEvalConfig, + config: smith_eval.RunEvalConfig, run_inputs: Optional[List[str]], run_outputs: Optional[List[str]], example_outputs: Optional[List[str]], @@ -546,7 +517,7 @@ def _get_keys( def _load_run_evaluators( - config: RunEvalConfig, + config: smith_eval.RunEvalConfig, run_type: str, data_type: DataType, example_outputs: Optional[List[str]], @@ -593,7 +564,7 @@ def _load_run_evaluators( run_evaluators.append(custom_evaluator) elif isinstance(custom_evaluator, StringEvaluator): run_evaluators.append( - StringRunEvaluatorChain.from_run_and_data_type( + smith_eval.StringRunEvaluatorChain.from_run_and_data_type( custom_evaluator, run_type, data_type, @@ -694,10 +665,9 @@ async def _arun_chain( async def _arun_llm_or_chain( example: Example, - llm_or_chain_factory: MCF, + config: RunnableConfig, *, - tags: Optional[List[str]] = None, - callbacks: Optional[List[BaseCallbackHandler]] = None, + llm_or_chain_factory: MCF, input_mapper: Optional[Callable[[Dict], Any]] = None, ) -> Union[dict, str, LLMResult, ChatResult]: """Asynchronously run the Chain or language model. @@ -712,15 +682,6 @@ async def _arun_llm_or_chain( Returns: A list of outputs. """ - if callbacks: - previous_example_ids = [ - getattr(tracer, "example_id", None) for tracer in callbacks - ] - for tracer in callbacks: - if hasattr(tracer, "example_id"): - tracer.example_id = example.id - else: - previous_example_ids = None chain_or_llm = ( "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain" ) @@ -730,8 +691,8 @@ async def _arun_llm_or_chain( output: Any = await _arun_llm( llm_or_chain_factory, example.inputs, - tags=tags, - callbacks=callbacks, + tags=config["tags"], + callbacks=config["callbacks"], input_mapper=input_mapper, ) else: @@ -739,198 +700,19 @@ async def _arun_llm_or_chain( output = await _arun_chain( chain, example.inputs, - tags=tags, - callbacks=callbacks, + tags=config["tags"], + callbacks=config["callbacks"], input_mapper=input_mapper, ) result = output except Exception as e: - logger.warning(f"{chain_or_llm} failed for example {example.id}. Error: {e}") - result = {"Error": str(e)} - if callbacks and previous_example_ids: - for example_id, tracer in zip(previous_example_ids, callbacks): - if hasattr(tracer, "example_id"): - tracer.example_id = example_id - return result - - -async def _gather_with_concurrency( - n: int, - initializer: Callable[[], Coroutine[Any, Any, Any]], - *async_funcs: Callable[ - [Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any] - ], -) -> List[Any]: - """Run coroutines with a concurrency limit. - - Args: - n: The maximum number of concurrent tasks. - initializer: A coroutine that initializes shared resources for the tasks. - async_funcs: The async_funcs to be run concurrently. - - Returns: - A list of results from the coroutines. 
- """ - semaphore = asyncio.Semaphore(n) - job_state = {"num_processed": 0} - - callback_queue: asyncio.Queue[Sequence[BaseCallbackHandler]] = asyncio.Queue() - for _ in range(n): - callback_queue.put_nowait(await initializer()) - - async def run_coroutine_with_semaphore( - async_func: Callable[ - [Sequence[BaseCallbackHandler], Dict], Coroutine[Any, Any, Any] - ] - ) -> Any: - async with semaphore: - callbacks = await callback_queue.get() - try: - result = await async_func(callbacks, job_state) - finally: - callback_queue.put_nowait(callbacks) - return result - - results = await asyncio.gather( - *(run_coroutine_with_semaphore(function) for function in async_funcs) - ) - while callback_queue: - try: - callbacks = callback_queue.get_nowait() - except asyncio.QueueEmpty: - break - for callback in callbacks: - if isinstance(callback, (LangChainTracer, EvaluatorCallbackHandler)): - callback.wait_for_futures() - return results - - -async def _callbacks_initializer( - project_name: Optional[str], - client: Client, - run_evaluators: Sequence[RunEvaluator], - evaluation_handler_collector: List[EvaluatorCallbackHandler], -) -> List[BaseTracer]: - """ - Initialize a tracer to share across tasks. - - Args: - project_name: The project name for the tracer. - client: The client to use for the tracer. - run_evaluators: The evaluators to run. - evaluation_handler_collector: A list to collect the evaluators. - Used to wait for the evaluators to finish. - - Returns: - The callbacks for this thread. - """ - callbacks: List[BaseTracer] = [] - if project_name: - callbacks.append( - LangChainTracer( - project_name=project_name, client=client, use_threading=False - ) - ) - if run_evaluators: - callback = EvaluatorCallbackHandler( - client=client, - evaluators=run_evaluators, - # We already have concurrency, don't want to overload the machine - max_workers=1, - ) - callbacks.append(callback) - evaluation_handler_collector.append(callback) - return callbacks - - -async def _arun_on_examples( - client: Client, - examples: Iterator[Example], - llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, - *, - evaluation: Optional[RunEvalConfig] = None, - concurrency_level: int = 5, - project_name: Optional[str] = None, - verbose: bool = False, - tags: Optional[List[str]] = None, - input_mapper: Optional[Callable[[Dict], Any]] = None, - data_type: DataType = DataType.kv, -) -> Dict[str, Any]: - """ - Asynchronously run the chain on examples and store traces - to the specified project name. - - Args: - client: LangSmith client to use to log feedback and runs. - examples: Examples to run the model or chain over. - llm_or_chain_factory: Language model or Chain constructor to run - over the dataset. The Chain constructor is used to permit - independent calls on each example without carrying over state. - evaluation: Optional evaluation configuration to use when evaluating - concurrency_level: The number of async tasks to run concurrently. - project_name: Project name to use when tracing runs. - Defaults to {dataset_name}-{chain class name}-{datetime}. - verbose: Whether to print progress. - tags: Tags to add to each run in the project. - input_mapper: function to map to the inputs dictionary from an Example - to the format expected by the model to be evaluated. This is useful if - your model needs to deserialize more complex schema or if your dataset - has inputs with keys that differ from what is expected by your chain - or agent. - data_type: The dataset's data type. 
This is used to determine determine - how to deserialize the reference data and model compatibility. - Returns: - A dictionary mapping example ids to the model outputs. - """ - wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory) - project_name = _get_project_name(project_name, wrapped_model) - run_evaluators, examples = _setup_evaluation( - wrapped_model, examples, evaluation, data_type - ) - examples = _validate_example_inputs(examples, wrapped_model, input_mapper) - results: Dict[str, dict] = {} - - async def process_example( - example: Example, callbacks: List[BaseCallbackHandler], job_state: dict - ) -> None: - """Process a single example.""" - result = await _arun_llm_or_chain( - example, - wrapped_model, - tags=tags, - callbacks=callbacks, - input_mapper=input_mapper, + logger.warning( + f"{chain_or_llm} failed for example {example.id} " + f"with inputs {example.inputs}" + f"\n{repr(e)}" ) - results[str(example.id)] = {"output": result} - job_state["num_processed"] += 1 - if verbose: - print( - f"Processed examples: {job_state['num_processed']}", - end="\r", - flush=True, - ) - - evaluation_handlers: List[EvaluatorCallbackHandler] = [] - await _gather_with_concurrency( - concurrency_level, - functools.partial( - _callbacks_initializer, - project_name=project_name, - client=client, - evaluation_handler_collector=evaluation_handlers, - run_evaluators=run_evaluators or [], - ), - *(functools.partial(process_example, e) for e in examples), - ) - all_feedback = {} - for handler in evaluation_handlers: - handler.wait_for_futures() - all_feedback.update(handler.logged_feedback) - # join the results and feedback on the example id - for example_id, output_dict in results.items(): - feedback = all_feedback.get(example_id, []) - output_dict["feedback"] = feedback - return results + result = {"Error": repr(e)} + return result ## Sync Utilities @@ -1011,10 +793,9 @@ def _run_chain( def _run_llm_or_chain( example: Example, - llm_or_chain_factory: MCF, + config: RunnableConfig, *, - tags: Optional[List[str]] = None, - callbacks: Optional[List[BaseCallbackHandler]] = None, + llm_or_chain_factory: MCF, input_mapper: Optional[Callable[[Dict], Any]] = None, ) -> Union[dict, str, LLMResult, ChatResult]: """ @@ -1030,15 +811,6 @@ def _run_llm_or_chain( Union[List[dict], List[str], List[LLMResult], List[ChatResult]]: The outputs of the model or chain. 
""" - if callbacks: - previous_example_ids = [ - getattr(tracer, "example_id", None) for tracer in callbacks - ] - for tracer in callbacks: - if hasattr(tracer, "example_id"): - tracer.example_id = example.id - else: - previous_example_ids = None chain_or_llm = ( "LLM" if isinstance(llm_or_chain_factory, BaseLanguageModel) else "Chain" ) @@ -1048,8 +820,8 @@ def _run_llm_or_chain( output: Any = _run_llm( llm_or_chain_factory, example.inputs, - callbacks, - tags=tags, + config["callbacks"], + tags=config["tags"], input_mapper=input_mapper, ) else: @@ -1057,98 +829,22 @@ def _run_llm_or_chain( output = _run_chain( chain, example.inputs, - callbacks, - tags=tags, + config["callbacks"], + tags=config["tags"], input_mapper=input_mapper, ) result = output except Exception as e: + error_type = type(e).__name__ logger.warning( - f"{chain_or_llm} failed for example {example.id} with inputs:" - f" {example.inputs}.\nError: {e}", + f"{chain_or_llm} failed for example {example.id} " + f"with inputs {example.inputs}" + f"\nError Type: {error_type}, Message: {e}" ) - result = {"Error": str(e)} - if callbacks and previous_example_ids: - for example_id, tracer in zip(previous_example_ids, callbacks): - if hasattr(tracer, "example_id"): - tracer.example_id = example_id + result = {"Error": repr(e)} return result -def _run_on_examples( - client: Client, - examples: Iterator[Example], - llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, - *, - evaluation: Optional[RunEvalConfig] = None, - project_name: Optional[str] = None, - verbose: bool = False, - tags: Optional[List[str]] = None, - input_mapper: Optional[Callable[[Dict], Any]] = None, - data_type: DataType = DataType.kv, -) -> Dict[str, Any]: - """ - Run the Chain or language model on examples and store - traces to the specified project name. - - Args: - client: LangSmith client to use to log feedback and runs. - examples: Examples to run the model or chain over. - llm_or_chain_factory: Language model or Chain constructor to run - over the dataset. The Chain constructor is used to permit - independent calls on each example without carrying over state. - evaluation: Optional evaluation configuration to use when evaluating - project_name: Name of the project to store the traces in. - Defaults to {dataset_name}-{chain class name}-{datetime}. - verbose: Whether to print progress. - tags: Tags to add to each run in the project. - input_mapper: A function to map to the inputs dictionary from an Example - to the format expected by the model to be evaluated. This is useful if - your model needs to deserialize more complex schema or if your dataset - has inputs with keys that differ from what is expected by your chain - or agent. - data_type: The dataset's data type. This is used to determine determine - how to deserialize the reference data and model compatibility. - - Returns: - A dictionary mapping example ids to the model outputs. 
- """ - results: Dict[str, dict] = {} - wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory) - project_name = _get_project_name(project_name, wrapped_model) - tracer = LangChainTracer( - project_name=project_name, client=client, use_threading=False - ) - run_evaluators, examples = _setup_evaluation( - wrapped_model, examples, evaluation, data_type - ) - examples = _validate_example_inputs(examples, wrapped_model, input_mapper) - evaluation_handler = EvaluatorCallbackHandler( - evaluators=run_evaluators or [], - client=client, - ) - callbacks: List[BaseCallbackHandler] = [tracer, evaluation_handler] - for i, example in enumerate(examples): - result = _run_llm_or_chain( - example, - wrapped_model, - tags=tags, - callbacks=callbacks, - input_mapper=input_mapper, - ) - if verbose: - print(f"{i+1} processed", flush=True, end="\r") - results[str(example.id)] = {"output": result} - tracer.wait_for_futures() - evaluation_handler.wait_for_futures() - all_feedback = evaluation_handler.logged_feedback - # join the results and feedback on the example id - for example_id, output_dict in results.items(): - feedback = all_feedback.get(example_id, []) - output_dict["feedback"] = feedback - return results - - ## Public API @@ -1156,10 +852,9 @@ def _prepare_eval_run( client: Client, dataset_name: str, llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, - project_name: Optional[str], -) -> Tuple[MCF, str, Dataset, Iterator[Example]]: + project_name: str, +) -> Tuple[MCF, str, Dataset, List[Example]]: wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory, dataset_name) - project_name = _get_project_name(project_name, wrapped_model) try: project = client.create_project(project_name) except ValueError as e: @@ -1168,21 +863,95 @@ def _prepare_eval_run( raise ValueError( f"Project {project_name} already exists. Please use a different name." 
) - project_url = _get_eval_project_url(client.api_url, project.id) print( - f"View the evaluation results for project '{project_name}' at:\n{project_url}" + f"View the evaluation results for project '{project_name}' at:\n{project.url}" ) dataset = client.read_dataset(dataset_name=dataset_name) - examples = client.list_examples(dataset_id=str(dataset.id)) + examples = list(client.list_examples(dataset_id=dataset.id)) + if not examples: + raise ValueError(f"Dataset {dataset_name} has no example rows.") return wrapped_model, project_name, dataset, examples +def _prepare_run_on_dataset( + client: Client, + dataset_name: str, + llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, + project_name: Optional[str], + evaluation: Optional[smith_eval.RunEvalConfig] = None, + tags: Optional[List[str]] = None, + input_mapper: Optional[Callable[[Dict], Any]] = None, + concurrency_level: int = 5, +) -> Tuple[MCF, str, List[Example], List[RunnableConfig]]: + project_name = project_name or name_generation.random_name() + wrapped_model, project_name, dataset, examples = _prepare_eval_run( + client, dataset_name, llm_or_chain_factory, project_name + ) + wrapped_model = _wrap_in_chain_factory(llm_or_chain_factory) + run_evaluators = _setup_evaluation( + wrapped_model, examples, evaluation, dataset.data_type + ) + _validate_example_inputs(examples[0], wrapped_model, input_mapper) + progress_bar = progress.ProgressBarCallback(len(examples)) + configs = [ + RunnableConfig( + callbacks=[ + LangChainTracer( + project_name=project_name, + client=client, + use_threading=False, + example_id=example.id, + ), + EvaluatorCallbackHandler( + evaluators=run_evaluators or [], + client=client, + max_workers=0, + example_id=example.id, + ), + progress_bar, + ], + tags=tags or [], + max_concurrency=concurrency_level, + ) + for example in examples + ] + return wrapped_model, project_name, examples, configs + + +def _collect_test_results( + examples: List[Example], + batch_results: List[Union[dict, str, LLMResult, ChatResult]], + configs: List[RunnableConfig], + project_name: str, +) -> TestResult: + wait_for_all_tracers() + all_feedback = {} + for c in configs: + for callback in cast(list, c["callbacks"]): + if isinstance(callback, EvaluatorCallbackHandler): + all_feedback.update(callback.logged_feedback) + results = {} + for example, output in zip(examples, batch_results): + feedback = all_feedback.get(str(example.id), []) + results[str(example.id)] = { + "output": output, + "input": example.inputs, + "feedback": feedback, + } + if example.outputs: + results[str(example.id)]["reference"] = example.outputs + return TestResult( + project_name=project_name, + results=results, + ) + + async def arun_on_dataset( client: Client, dataset_name: str, llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, *, - evaluation: Optional[RunEvalConfig] = None, + evaluation: Optional[smith_eval.RunEvalConfig] = None, concurrency_level: int = 5, project_name: Optional[str] = None, verbose: bool = False, @@ -1227,7 +996,7 @@ async def arun_on_dataset( from langsmith import Client from langchain.chat_models import ChatOpenAI from langchain.chains import LLMChain - from langchain.smith import RunEvalConfig, arun_on_dataset + from langchain.smith import evaluation as smith_eval, arun_on_dataset # Chains may have memory. Passing in a constructor function lets the # evaluation framework avoid cross-contamination between runs.
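For orientation, here is a minimal sketch of the constructor-function pattern the docstring above describes: each call builds a fresh chain (and fresh memory), so state cannot leak between evaluation runs. The model, prompt, and function name are illustrative placeholders, not part of this change.

from langchain.chains import LLMChain
from langchain.chat_models import ChatOpenAI
from langchain.prompts import PromptTemplate

def construct_chain() -> LLMChain:
    # Called once per dataset example by the evaluation harness, so every
    # run starts from a clean chain instance with no carried-over state.
    llm = ChatOpenAI(temperature=0)
    prompt = PromptTemplate.from_template("Answer the question: {query}")
    return LLMChain(llm=llm, prompt=prompt)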
@@ -1240,12 +1009,12 @@ async def arun_on_dataset( return chain # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum) - evaluation_config = RunEvalConfig( + evaluation_config = smith_eval.RunEvalConfig( evaluators=[ "qa", # "Correctness" against a reference answer "embedding_distance", - RunEvalConfig.Criteria("helpfulness"), - RunEvalConfig.Criteria({ + smith_eval.RunEvalConfig.Criteria("helpfulness"), + smith_eval.RunEvalConfig.Criteria({ "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?" }), ] @@ -1286,7 +1055,7 @@ async def arun_on_dataset( return {"score": prediction == reference} - evaluation_config = RunEvalConfig( + evaluation_config = smith_eval.RunEvalConfig( custom_evaluators = [MyStringEvaluator()], ) @@ -1299,51 +1068,43 @@ async def arun_on_dataset( """ # noqa: E501 if kwargs: warnings.warn( - "The following arguments are deprecated and will " - "be removed in a future release: " + "The following arguments are deprecated and " + "will be removed in a future release: " f"{kwargs.keys()}.", DeprecationWarning, ) - wrapped_model, project_name, dataset, examples = _prepare_eval_run( - client, dataset_name, llm_or_chain_factory, project_name - ) - results = await _arun_on_examples( + wrapped_model, project_name, examples, configs = _prepare_run_on_dataset( client, - examples, - wrapped_model, - concurrency_level=concurrency_level, - project_name=project_name, - verbose=verbose, - tags=tags, - evaluation=evaluation, - input_mapper=input_mapper, - data_type=dataset.data_type, - ) - return TestResult( - project_name=project_name, - results=results, + dataset_name, + llm_or_chain_factory, + project_name, + evaluation, + tags, + input_mapper, + concurrency_level, ) - -def _handle_coroutine(coro: Coroutine) -> Any: - """ - Handles a coroutine from a sync context. - - Args: - coro (asyncio.coroutine): The coroutine to be handled. - - Returns: - any: The result of the executed coroutine. - """ - # Check if there's a running event loop - try: - loop = asyncio.get_event_loop() - except RuntimeError: # No event loop - return asyncio.run(coro) - if loop.is_running(): - return loop.run_until_complete(coro) - else: - return asyncio.run(coro) + batch_results = await runnable_utils.gather_with_concurrency( + configs[0].get("max_concurrency"), + *map( + functools.partial( + _arun_llm_or_chain, + llm_or_chain_factory=wrapped_model, + input_mapper=input_mapper, + ), + examples, + configs, + ), + ) + results = _collect_test_results(examples, batch_results, configs, project_name) + if verbose: + try: + agg_feedback = results.get_aggregate_feedback() + print("\n Eval quantiles:") + print(agg_feedback) + except Exception as e: + logger.debug(f"Failed to print aggregate feedback: {repr(e)}") + return results def run_on_dataset( @@ -1351,7 +1112,7 @@ dataset_name: str, llm_or_chain_factory: MODEL_OR_CHAIN_FACTORY, *, - evaluation: Optional[RunEvalConfig] = None, + evaluation: Optional[smith_eval.RunEvalConfig] = None, concurrency_level: int = 5, project_name: Optional[str] = None, verbose: bool = False, @@ -1397,7 +1158,7 @@ def run_on_dataset( from langsmith import Client from langchain.chat_models import ChatOpenAI from langchain.chains import LLMChain - from langchain.smith import RunEvalConfig, run_on_dataset + from langchain.smith import evaluation as smith_eval, run_on_dataset # Chains may have memory. Passing in a constructor function lets the # evaluation framework avoid cross-contamination between runs.
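A hedged sketch of consuming the returned TestResult with the helpers added above; the dataset name is a placeholder, and construct_chain / evaluation_config are assumed to be defined as in the surrounding docstring examples.

from langsmith import Client
from langchain.smith import run_on_dataset

results = run_on_dataset(
    Client(),
    dataset_name="my-eval-dataset",
    llm_or_chain_factory=construct_chain,
    evaluation=evaluation_config,
)
df = results.to_dataframe()  # one row per example: input, output, feedback, reference (requires pandas)
print(results.get_aggregate_feedback())  # 0.25 / 0.5 / 0.75 quantiles per feedback key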
@@ -1410,12 +1171,12 @@ def run_on_dataset( return chain # Load off-the-shelf evaluators via config or the EvaluatorType (string or enum) - evaluation_config = RunEvalConfig( + evaluation_config = smith_eval.RunEvalConfig( evaluators=[ "qa", # "Correctness" against a reference answer "embedding_distance", - RunEvalConfig.Criteria("helpfulness"), - RunEvalConfig.Criteria({ + smith_eval.RunEvalConfig.Criteria("helpfulness"), + smith_eval.RunEvalConfig.Criteria({ "fifth-grader-score": "Do you have to be smarter than a fifth grader to answer this question?" }), ] @@ -1456,7 +1217,7 @@ def run_on_dataset( return {"score": prediction == reference} - evaluation_config = RunEvalConfig( + evaluation_config = smith_eval.RunEvalConfig( custom_evaluators = [MyStringEvaluator()], ) @@ -1474,37 +1235,35 @@ def run_on_dataset( f"{kwargs.keys()}.", DeprecationWarning, ) - wrapped_model, project_name, dataset, examples = _prepare_eval_run( - client, dataset_name, llm_or_chain_factory, project_name + wrapped_model, project_name, examples, configs = _prepare_run_on_dataset( + client, + dataset_name, + llm_or_chain_factory, + project_name, + evaluation, + tags, + input_mapper, + concurrency_level, ) - if concurrency_level in (0, 1): - results = _run_on_examples( - client, - examples, - wrapped_model, - project_name=project_name, - verbose=verbose, - tags=tags, - evaluation=evaluation, - input_mapper=input_mapper, - data_type=dataset.data_type, - ) - else: - # TODO: Use runnables and the batch method - coro = _arun_on_examples( - client, - examples, - wrapped_model, - concurrency_level=concurrency_level, - project_name=project_name, - verbose=verbose, - tags=tags, - evaluation=evaluation, - input_mapper=input_mapper, - data_type=dataset.data_type, + with runnable_config.get_executor_for_config(configs[0]) as executor: + batch_results = list( + executor.map( + functools.partial( + _run_llm_or_chain, + llm_or_chain_factory=wrapped_model, + input_mapper=input_mapper, + ), + examples, + configs, + ) ) - results = _handle_coroutine(coro) - return TestResult( - project_name=project_name, - results=results, - ) + + results = _collect_test_results(examples, batch_results, configs, project_name) + if verbose: + try: + agg_feedback = results.get_aggregate_feedback() + print("\n Eval quantiles:") + print(agg_feedback) + except Exception as e: + logger.debug(f"Failed to print aggregate feedback: {repr(e)}") + return results diff --git a/libs/langchain/langchain/smith/evaluation/string_run_evaluator.py b/libs/langchain/langchain/smith/evaluation/string_run_evaluator.py index d8133b19d8..1c14017673 100644 --- a/libs/langchain/langchain/smith/evaluation/string_run_evaluator.py +++ b/libs/langchain/langchain/smith/evaluation/string_run_evaluator.py @@ -148,13 +148,27 @@ class ChainStringRunMapper(StringRunMapper): def map(self, run: Run) -> Dict[str, str]: """Maps the Run to a dictionary.""" if not run.outputs: - raise ValueError(f"Run {run.id} has no outputs to evaluate.") + raise ValueError( + f"Run with ID {run.id} lacks outputs required for evaluation." + " Ensure the Run has valid outputs." + ) if self.input_key is not None and self.input_key not in run.inputs: - raise ValueError(f"Run {run.id} does not have input key {self.input_key}.") + raise ValueError( + f"Run with ID {run.id} is missing the expected input key" + f" '{self.input_key}'.\nAvailable input keys in this Run" + f" are: {run.inputs.keys()}.\nAdjust the evaluator's" + f" input_key or ensure your input data includes key" + f" '{self.input_key}'." 
+ ) elif self.prediction_key is not None and self.prediction_key not in run.outputs: + available_keys = ", ".join(run.outputs.keys()) raise ValueError( - f"Run {run.id} does not have prediction key {self.prediction_key}." + f"Run with ID {run.id} doesn't have the expected prediction key" + f" '{self.prediction_key}'. Available prediction keys in this Run are:" + f" {available_keys}. Adjust the evaluator's prediction_key or" + " ensure the Run's outputs include the expected key." ) + else: input_ = self._get_key(run.inputs, self.input_key, "input") prediction = self._get_key(run.outputs, self.prediction_key, "prediction") diff --git a/libs/langchain/langchain/text_splitter.py b/libs/langchain/langchain/text_splitter.py index e804b93be9..2e5f7021f3 100644 --- a/libs/langchain/langchain/text_splitter.py +++ b/libs/langchain/langchain/text_splitter.py @@ -100,6 +100,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): length_function: Callable[[str], int] = len, keep_separator: bool = False, add_start_index: bool = False, + strip_whitespace: bool = True, ) -> None: """Create a new TextSplitter. @@ -109,6 +110,8 @@ class TextSplitter(BaseDocumentTransformer, ABC): length_function: Function that measures the length of given chunks keep_separator: Whether to keep the separator in the chunks add_start_index: If `True`, includes chunk's start index in metadata + strip_whitespace: If `True`, strips whitespace from the start and end of + every document """ if chunk_overlap > chunk_size: raise ValueError( @@ -120,6 +123,7 @@ class TextSplitter(BaseDocumentTransformer, ABC): self._length_function = length_function self._keep_separator = keep_separator self._add_start_index = add_start_index + self._strip_whitespace = strip_whitespace @abstractmethod def split_text(self, text: str) -> List[str]: @@ -152,7 +156,8 @@ class TextSplitter(BaseDocumentTransformer, ABC): def _join_docs(self, docs: List[str], separator: str) -> Optional[str]: text = separator.join(docs) - text = text.strip() + if self._strip_whitespace: + text = text.strip() if text == "": return None else: @@ -622,6 +627,7 @@ class Language(str, Enum): LATEX = "latex" HTML = "html" SOL = "sol" + CSHARP = "csharp" class RecursiveCharacterTextSplitter(TextSplitter): @@ -997,6 +1003,43 @@ class RecursiveCharacterTextSplitter(TextSplitter): "<title", "", ] + elif language == Language.CSHARP: + return [ + "\ninterface ", + "\nenum ", + "\nimplements ", + "\ndelegate ", + "\nevent ", + # Split along class definitions + "\nclass ", + "\nabstract ", + # Split along method definitions + "\npublic ", + "\nprotected ", + "\nprivate ", + "\nstatic ", + "\nreturn ", + # Split along control flow statements + "\nif ", + "\ncontinue ", + "\nfor ", + "\nforeach ", + "\nwhile ", + "\nswitch ", + "\nbreak ", + "\ncase ", + "\nelse ", + # Split by exceptions + "\ntry ", + "\nthrow ", + "\nfinally ", + "\ncatch ", + # Split by the normal type of lines + "\n\n", + "\n", + " ", + "", + ] elif language == Language.SOL: return [ # Split along compiler information definitions @@ -1027,6 +1070,7 @@ class RecursiveCharacterTextSplitter(TextSplitter): " ", "", ] + else: raise ValueError( f"Language {language} is not supported! 
" @@ -1037,7 +1081,9 @@ class RecursiveCharacterTextSplitter(TextSplitter): class NLTKTextSplitter(TextSplitter): """Splitting text using NLTK package.""" - def __init__(self, separator: str = "\n\n", **kwargs: Any) -> None: + def __init__( + self, separator: str = "\n\n", language: str = "english", **kwargs: Any + ) -> None: """Initialize the NLTK splitter.""" super().__init__(**kwargs) try: @@ -1049,11 +1095,12 @@ class NLTKTextSplitter(TextSplitter): "NLTK is not installed, please install it with `pip install nltk`." ) self._separator = separator + self._language = language def split_text(self, text: str) -> List[str]: """Split incoming text and return chunks.""" # First we naively split the large input into a bunch of smaller ones. - splits = self._tokenizer(text) + splits = self._tokenizer(text, language=self._language) return self._merge_splits(splits, self._separator) diff --git a/libs/langchain/langchain/tools/base.py b/libs/langchain/langchain/tools/base.py index 9ad81033d5..69597cd903 100644 --- a/libs/langchain/langchain/tools/base.py +++ b/libs/langchain/langchain/tools/base.py @@ -592,7 +592,7 @@ class StructuredTool(BaseTool): None, partial(self.invoke, input, config, **kwargs) ) - return super().ainvoke(input, config, **kwargs) + return await super().ainvoke(input, config, **kwargs) # --- Tool --- diff --git a/libs/langchain/langchain/tools/sql_database/tool.py b/libs/langchain/langchain/tools/sql_database/tool.py index f60275bcaf..75f45c7b9e 100644 --- a/libs/langchain/langchain/tools/sql_database/tool.py +++ b/libs/langchain/langchain/tools/sql_database/tool.py @@ -93,7 +93,7 @@ class QuerySQLCheckerTool(BaseSQLDatabaseTool, BaseTool): name: str = "sql_db_query_checker" description: str = """ Use this tool to double check if your query is correct before executing it. - Always use this tool before executing a query with query_sql_db! + Always use this tool before executing a query with sql_db_query! """ @root_validator(pre=True) diff --git a/libs/langchain/langchain/utilities/redis.py b/libs/langchain/langchain/utilities/redis.py index a45391c8bc..605a611967 100644 --- a/libs/langchain/langchain/utilities/redis.py +++ b/libs/langchain/langchain/utilities/redis.py @@ -17,6 +17,10 @@ def _array_to_buffer(array: List[float], dtype: Any = np.float32) -> bytes: return np.array(array).astype(dtype).tobytes() +def _buffer_to_array(buffer: bytes, dtype: Any = np.float32) -> List[float]: + return np.frombuffer(buffer, dtype=dtype).tolist() + + class TokenEscaper: """ Escape punctuation within an input string. 
diff --git a/libs/langchain/langchain/utilities/sql_database.py b/libs/langchain/langchain/utilities/sql_database.py index 110f081d3c..13718c8c0c 100644 --- a/libs/langchain/langchain/utilities/sql_database.py +++ b/libs/langchain/langchain/utilities/sql_database.py @@ -9,6 +9,7 @@ from sqlalchemy import MetaData, Table, create_engine, inspect, select, text from sqlalchemy.engine import Engine from sqlalchemy.exc import ProgrammingError, SQLAlchemyError from sqlalchemy.schema import CreateTable +from sqlalchemy.types import NullType from langchain.utils import get_from_env @@ -314,6 +315,11 @@ class SQLDatabase: tables.append(self._custom_table_info[table.name]) continue + # Ignore JSON datatyped columns + for k, v in table.columns.items(): + if type(v.type) is NullType: + table._columns.remove(v) + # add create table command create_table = str(CreateTable(table).compile(self._engine)) table_info = f"{create_table.rstrip()}" @@ -384,6 +390,8 @@ class SQLDatabase: connection.exec_driver_sql(f"SET @@dataset_id='{self._schema}'") elif self.dialect == "mssql": pass + elif self.dialect == "trino": + connection.exec_driver_sql(f"USE {self._schema}") else: # postgresql and compatible dialects connection.exec_driver_sql(f"SET search_path TO {self._schema}") cursor = connection.execute(text(command)) diff --git a/libs/langchain/langchain/vectorstores/chroma.py b/libs/langchain/langchain/vectorstores/chroma.py index 76469357a2..706588202b 100644 --- a/libs/langchain/langchain/vectorstores/chroma.py +++ b/libs/langchain/langchain/vectorstores/chroma.py @@ -142,6 +142,7 @@ class Chroma(VectorStore): query_embeddings: Optional[List[List[float]]] = None, n_results: int = 4, where: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Query the chroma collection.""" @@ -157,6 +158,7 @@ class Chroma(VectorStore): query_embeddings=query_embeddings, n_results=n_results, where=where, + where_document=where_document, **kwargs, ) @@ -264,6 +266,7 @@ class Chroma(VectorStore): embedding: List[float], k: int = DEFAULT_K, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Return docs most similar to embedding vector. @@ -275,7 +278,10 @@ class Chroma(VectorStore): List of Documents most similar to the query vector. """ results = self.__query_collection( - query_embeddings=embedding, n_results=k, where=filter + query_embeddings=embedding, + n_results=k, + where=filter, + where_document=where_document, ) return _results_to_docs(results) @@ -284,6 +290,7 @@ class Chroma(VectorStore): embedding: List[float], k: int = DEFAULT_K, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: """ @@ -300,7 +307,10 @@ class Chroma(VectorStore): Lower score represents more similarity. """ results = self.__query_collection( - query_embeddings=embedding, n_results=k, where=filter + query_embeddings=embedding, + n_results=k, + where=filter, + where_document=where_document, ) return _results_to_docs_and_scores(results) @@ -309,6 +319,7 @@ class Chroma(VectorStore): query: str, k: int = DEFAULT_K, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: """Run similarity search with Chroma with distance. 
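A usage sketch of the new where_document parameter now threaded through the Chroma search methods; `store` stands in for any existing Chroma instance, and the query text and filter values are placeholders. Note that `where` (the `filter` argument) matches on metadata, while `where_document` matches on the document text itself.

docs_and_scores = store.similarity_search_with_score(
    "greeting text",
    k=4,
    filter={"source": "faq"},               # metadata filter (where)
    where_document={"$contains": "hello"},  # full-text filter on document content
)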
@@ -325,12 +336,18 @@ class Chroma(VectorStore): """ if self._embedding_function is None: results = self.__query_collection( - query_texts=[query], n_results=k, where=filter + query_texts=[query], + n_results=k, + where=filter, + where_document=where_document, ) else: query_embedding = self._embedding_function.embed_query(query) results = self.__query_collection( - query_embeddings=[query_embedding], n_results=k, where=filter + query_embeddings=[query_embedding], + n_results=k, + where=filter, + where_document=where_document, ) return _results_to_docs_and_scores(results) @@ -374,6 +391,7 @@ class Chroma(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -398,6 +416,7 @@ class Chroma(VectorStore): query_embeddings=embedding, n_results=fetch_k, where=filter, + where_document=where_document, include=["metadatas", "documents", "distances", "embeddings"], ) mmr_selected = maximal_marginal_relevance( @@ -419,6 +438,7 @@ class Chroma(VectorStore): fetch_k: int = 20, lambda_mult: float = 0.5, filter: Optional[Dict[str, str]] = None, + where_document: Optional[Dict[str, str]] = None, **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance. @@ -445,7 +465,12 @@ class Chroma(VectorStore): embedding = self._embedding_function.embed_query(query) docs = self.max_marginal_relevance_search_by_vector( - embedding, k, fetch_k, lambda_mult=lambda_mult, filter=filter + embedding, + k, + fetch_k, + lambda_mult=lambda_mult, + filter=filter, + where_document=where_document, ) return docs @@ -472,7 +497,7 @@ class Chroma(VectorStore): offset: The offset to start returning results from. Useful for paging results with limit. Optional. where_document: A WhereDocument type dict used to filter by the documents. - E.g. `{$contains: {"text": "hello"}}`. Optional. + E.g. `{$contains: "hello"}`. Optional. include: A list of what to include in the results. Can contain `"embeddings"`, `"metadatas"`, `"documents"`. Ids are always included. diff --git a/libs/langchain/langchain/vectorstores/nucliadb.py b/libs/langchain/langchain/vectorstores/nucliadb.py new file mode 100644 index 0000000000..8ba9d4454c --- /dev/null +++ b/libs/langchain/langchain/vectorstores/nucliadb.py @@ -0,0 +1,159 @@ +import os +from typing import Any, Dict, Iterable, List, Optional, Type + +from langchain.embeddings.base import Embeddings +from langchain.schema.document import Document +from langchain.vectorstores.base import VST, VectorStore + +FIELD_TYPES = { + "f": "files", + "t": "texts", + "l": "links", +} + + +class NucliaDB(VectorStore): + """NucliaDB vector store.""" + + _config: Dict[str, Any] = {} + + def __init__( + self, + knowledge_box: str, + local: bool, + api_key: Optional[str] = None, + backend: Optional[str] = None, + ) -> None: + """Initialize the NucliaDB client. + + Args: + knowledge_box: the Knowledge Box id. + local: Whether to use a local NucliaDB instance or Nuclia Cloud + api_key: A contributor API key for the kb (needed when local is False) + backend: The backend url to use when local is True, defaults to + http://localhost:8080 + """ + try: + from nuclia.sdk import NucliaAuth + except ImportError: + raise ValueError( + "nuclia python package not found. " + "Please install it with `pip install nuclia`." 
+ ) + self._config["LOCAL"] = local + zone = os.environ.get("NUCLIA_ZONE", "europe-1") + self._kb = knowledge_box + if local: + if not backend: + backend = "http://localhost:8080" + self._config["BACKEND"] = f"{backend}/api/v1" + self._config["TOKEN"] = None + NucliaAuth().nucliadb(url=backend) + NucliaAuth().kb(url=self.kb_url, interactive=False) + else: + self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1" + self._config["TOKEN"] = api_key + NucliaAuth().kb( + url=self.kb_url, token=self._config["TOKEN"], interactive=False + ) + + @property + def is_local(self) -> str: + return self._config["LOCAL"] + + @property + def kb_url(self) -> str: + return f"{self._config['BACKEND']}/kb/{self._kb}" + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> List[str]: + """Upload texts to NucliaDB""" + ids = [] + from nuclia.sdk import NucliaResource + + factory = NucliaResource() + for i, text in enumerate(texts): + extra: Dict[str, Any] = {"metadata": ""} + if metadatas: + extra = {"metadata": metadatas[i]} + id = factory.create( + texts={"text": {"body": text}}, + extra=extra, + url=self.kb_url, + api_key=self._config["TOKEN"], + ) + ids.append(id) + return ids + + def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]: + if not ids: + return None + from nuclia.sdk import NucliaResource + + factory = NucliaResource() + results: List[bool] = [] + for id in ids: + try: + factory.delete(rid=id, url=self.kb_url, api_key=self._config["TOKEN"]) + results.append(True) + except ValueError: + results.append(False) + return all(results) + + def similarity_search( + self, query: str, k: int = 4, **kwargs: Any + ) -> List[Document]: + from nuclia.sdk import NucliaSearch + from nucliadb_models.search import FindRequest, ResourceProperties + + request = FindRequest( + query=query, + page_size=k, + show=[ResourceProperties.VALUES, ResourceProperties.EXTRA], + ) + search = NucliaSearch() + results = search.find( + query=request, url=self.kb_url, api_key=self._config["TOKEN"] + ) + paragraphs = [] + for resource in results.resources.values(): + for field in resource.fields.values(): + for paragraph_id, paragraph in field.paragraphs.items(): + info = paragraph_id.split("/") + field_type = FIELD_TYPES.get(info[1], None) + field_id = info[2] + if not field_type: + continue + value = getattr(resource.data, field_type, {}).get(field_id, None) + paragraphs.append( + { + "text": paragraph.text, + "metadata": { + "extra": getattr( + getattr(resource, "extra", {}), "metadata", None + ), + "value": value, + }, + "order": paragraph.order, + } + ) + sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"]) + return [ + Document(page_content=paragraph["text"], metadata=paragraph["metadata"]) + for paragraph in sorted_paragraphs + ] + + @classmethod + def from_texts( + cls: Type[VST], + texts: List[str], + embedding: Embeddings, + metadatas: Optional[List[dict]] = None, + **kwargs: Any, + ) -> VST: + """Return VectorStore initialized from texts and embeddings.""" + raise NotImplementedError diff --git a/libs/langchain/langchain/vectorstores/pgvector.py b/libs/langchain/langchain/vectorstores/pgvector.py index 6b02fc19c0..6186d1d7e6 100644 --- a/libs/langchain/langchain/vectorstores/pgvector.py +++ b/libs/langchain/langchain/vectorstores/pgvector.py @@ -1,9 +1,11 @@ from __future__ import annotations +import asyncio import contextlib import enum import logging import uuid +from functools import partial from typing import 
( TYPE_CHECKING, Any, @@ -17,6 +19,7 @@ from typing import ( Type, ) +import numpy as np import sqlalchemy from sqlalchemy import delete from sqlalchemy.dialects.postgresql import UUID @@ -26,6 +29,7 @@ from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings from langchain.utils import get_from_dict_or_env from langchain.vectorstores.base import VectorStore +from langchain.vectorstores.utils import maximal_marginal_relevance if TYPE_CHECKING: from langchain.vectorstores._pgvector_data_models import CollectionStore @@ -54,6 +58,11 @@ class BaseModel(Base): uuid = sqlalchemy.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4) +def _results_to_docs(docs_and_scores: Any) -> List[Document]: + """Return docs from docs and scores.""" + return [doc for doc, _ in docs_and_scores] + + class PGVector(VectorStore): """`Postgres`/`PGVector` vector store. @@ -339,7 +348,7 @@ class PGVector(VectorStore): filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. Returns: - List of Documents most similar to the query and score for each + List of Documents most similar to the query and score for each. """ embedding = self.embedding_function.embed_query(query) docs = self.similarity_search_with_score_by_vector( @@ -349,16 +358,16 @@ class PGVector(VectorStore): @property def distance_strategy(self) -> Any: - if self._distance_strategy == "l2": + if self._distance_strategy == DistanceStrategy.EUCLIDEAN: return self.EmbeddingStore.embedding.l2_distance - elif self._distance_strategy == "cosine": + elif self._distance_strategy == DistanceStrategy.COSINE: return self.EmbeddingStore.embedding.cosine_distance - elif self._distance_strategy == "inner": + elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT: return self.EmbeddingStore.embedding.max_inner_product else: raise ValueError( f"Got unexpected value for distance: {self._distance_strategy}. " - f"Should be one of `l2`, `cosine`, `inner`." + f"Should be one of {', '.join([ds.value for ds in DistanceStrategy])}." 
) def similarity_search_with_score_by_vector( @@ -367,6 +376,31 @@ class PGVector(VectorStore): k: int = 4, filter: Optional[dict] = None, ) -> List[Tuple[Document, float]]: + results = self.__query_collection(embedding=embedding, k=k, filter=filter) + + return self._results_to_docs_and_scores(results) + + def _results_to_docs_and_scores(self, results: Any) -> List[Tuple[Document, float]]: + """Return docs and scores from results.""" + docs = [ + ( + Document( + page_content=result.EmbeddingStore.document, + metadata=result.EmbeddingStore.cmetadata, + ), + result.distance if self.embedding_function is not None else None, + ) + for result in results + ] + return docs + + def __query_collection( + self, + embedding: List[float], + k: int = 4, + filter: Optional[Dict[str, str]] = None, + ) -> List[Any]: + """Query the collection.""" with Session(self._conn) as session: collection = self.get_collection(session) if not collection: @@ -410,18 +444,7 @@ class PGVector(VectorStore): .limit(k) .all() ) - - docs = [ - ( - Document( - page_content=result.EmbeddingStore.document, - metadata=result.EmbeddingStore.cmetadata, - ), - result.distance if self.embedding_function is not None else None, - ) - for result in results - ] - return docs + return results def similarity_search_by_vector( self, @@ -443,7 +466,7 @@ class PGVector(VectorStore): docs_and_scores = self.similarity_search_with_score_by_vector( embedding=embedding, k=k, filter=filter ) - return [doc for doc, _ in docs_and_scores] + return _results_to_docs(docs_and_scores) @classmethod def from_texts( @@ -640,3 +663,190 @@ class PGVector(VectorStore): f" for distance_strategy of {self._distance_strategy}." "Consider providing relevance_score_fn to PGVector constructor." ) + + def max_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, str]] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs selected using the maximal marginal relevance with score + to embedding vector. + + Maximal marginal relevance optimizes for similarity to query AND diversity + among selected documents. + + Args: + embedding: Embedding to look up documents similar to. + k (int): Number of Documents to return. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + Defaults to 20. + lambda_mult (float): Number between 0 and 1 that determines the degree + of diversity among the results with 0 corresponding + to maximum diversity and 1 to minimum diversity. + Defaults to 0.5. + filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None. + + Returns: + List[Tuple[Document, float]]: List of Documents selected by maximal marginal + relevance to the query and score for each. + """ + results = self.__query_collection(embedding=embedding, k=fetch_k, filter=filter) + + embedding_list = [result.EmbeddingStore.embedding for result in results] + + mmr_selected = maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + embedding_list, + k=k, + lambda_mult=lambda_mult, + ) + + candidates = self._results_to_docs_and_scores(results) + + return [r for i, r in enumerate(candidates) if i in mmr_selected] + + def max_marginal_relevance_search( + self, + query: str, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, str]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance. 
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query (str): Text to look up documents similar to.
+            k (int): Number of Documents to return. Defaults to 4.
+            fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
+                Defaults to 20.
+            lambda_mult (float): Number between 0 and 1 that determines the degree
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
+
+        Returns:
+            List[Document]: List of Documents selected by maximal marginal relevance.
+        """
+        embedding = self.embedding_function.embed_query(query)
+        return self.max_marginal_relevance_search_by_vector(
+            embedding,
+            k=k,
+            fetch_k=fetch_k,
+            lambda_mult=lambda_mult,
+            filter=filter,
+            **kwargs,
+        )
+
+    def max_marginal_relevance_search_with_score(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        filter: Optional[dict] = None,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return docs selected using the maximal marginal relevance with score.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query (str): Text to look up documents similar to.
+            k (int): Number of Documents to return. Defaults to 4.
+            fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
+                Defaults to 20.
+            lambda_mult (float): Number between 0 and 1 that determines the degree
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
+
+        Returns:
+            List[Tuple[Document, float]]: List of Documents selected by maximal marginal
+                relevance to the query and score for each.
+        """
+        embedding = self.embedding_function.embed_query(query)
+        docs = self.max_marginal_relevance_search_with_score_by_vector(
+            embedding=embedding,
+            k=k,
+            fetch_k=fetch_k,
+            lambda_mult=lambda_mult,
+            filter=filter,
+            **kwargs,
+        )
+        return docs
+
+    def max_marginal_relevance_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        filter: Optional[Dict[str, str]] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance
+        to embedding vector.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            embedding (List[float]): Embedding to look up documents similar to.
+            k (int): Number of Documents to return. Defaults to 4.
+            fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
+                Defaults to 20.
+            lambda_mult (float): Number between 0 and 1 that determines the degree
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            filter (Optional[Dict[str, str]]): Filter by metadata. Defaults to None.
+
+        Returns:
+            List[Document]: List of Documents selected by maximal marginal relevance.
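+
+        Example:
+            A minimal illustrative sketch, not a prescribed usage; it assumes
+            an already-initialized PGVector instance named ``store``:
+
+            >>> vector = store.embedding_function.embed_query("user question")
+            >>> docs = store.max_marginal_relevance_search_by_vector(
+            ...     vector, k=4, fetch_k=20, lambda_mult=0.5
+            ... )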
+ """ + docs_and_scores = self.max_marginal_relevance_search_with_score_by_vector( + embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + + return _results_to_docs(docs_and_scores) + + async def amax_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + filter: Optional[Dict[str, str]] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + + # This is a temporary workaround to make the similarity search + # asynchronous. The proper solution is to make the similarity search + # asynchronous in the vector store implementations. + func = partial( + self.max_marginal_relevance_search_by_vector, + embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + return await asyncio.get_event_loop().run_in_executor(None, func) diff --git a/libs/langchain/langchain/vectorstores/redis/__init__.py b/libs/langchain/langchain/vectorstores/redis/__init__.py index 6f05acb4ab..dc088facf4 100644 --- a/libs/langchain/langchain/vectorstores/redis/__init__.py +++ b/libs/langchain/langchain/vectorstores/redis/__init__.py @@ -1,4 +1,4 @@ -from .base import Redis +from .base import Redis, RedisVectorStoreRetriever from .filters import ( RedisFilter, RedisNum, @@ -6,4 +6,11 @@ from .filters import ( RedisText, ) -__all__ = ["Redis", "RedisFilter", "RedisTag", "RedisText", "RedisNum"] +__all__ = [ + "Redis", + "RedisFilter", + "RedisTag", + "RedisText", + "RedisNum", + "RedisVectorStoreRetriever", +] diff --git a/libs/langchain/langchain/vectorstores/redis/base.py b/libs/langchain/langchain/vectorstores/redis/base.py index a09ba44cba..320c6730e3 100644 --- a/libs/langchain/langchain/vectorstores/redis/base.py +++ b/libs/langchain/langchain/vectorstores/redis/base.py @@ -17,8 +17,10 @@ from typing import ( Tuple, Type, Union, + cast, ) +import numpy as np import yaml from langchain._api import deprecated @@ -30,6 +32,7 @@ from langchain.docstore.document import Document from langchain.embeddings.base import Embeddings from langchain.utilities.redis import ( _array_to_buffer, + _buffer_to_array, check_redis_module_exist, get_client, ) @@ -39,6 +42,7 @@ from langchain.vectorstores.redis.constants import ( REDIS_REQUIRED_MODULES, REDIS_TAG_SEPARATOR, ) +from langchain.vectorstores.utils import maximal_marginal_relevance logger = logging.getLogger(__name__) @@ -370,6 +374,11 @@ class Redis(VectorStore): if "generate" in kwargs: kwargs.pop("generate") + # see if the user specified keys + keys = None + if "keys" in kwargs: + keys = kwargs.pop("keys") + # Name of the search index if not given if not index_name: index_name = uuid.uuid4().hex @@ -418,7 +427,7 @@ class Redis(VectorStore): instance._create_index(dim=len(embeddings[0])) # Add data to Redis - keys = instance.add_texts(texts, metadatas, embeddings) + keys = instance.add_texts(texts, metadatas, embeddings, keys=keys) return instance, keys @classmethod @@ -803,8 +812,10 @@ class Redis(VectorStore): + "score_threshold will be removed in a future release.", ) + query_embedding = self._embeddings.embed_query(query) + redis_query, params_dict = self._prepare_query( - query, + query_embedding, k=k, filter=filter, with_metadata=return_metadata, @@ -858,13 +869,48 @@ class Redis(VectorStore): Defaults to None. return_metadata (bool, optional): Whether to return metadata. Defaults to True. 
-            distance_threshold (Optional[float], optional): Distance threshold
-                for vector distance from query vector. Defaults to None.
+            distance_threshold (Optional[float], optional): Maximum vector distance
+                between selected documents and the query vector. Defaults to None.
 
         Returns:
             List[Document]: A list of documents that are most similar to the query
                 text.
+        """
+        query_embedding = self._embeddings.embed_query(query)
+        return self.similarity_search_by_vector(
+            query_embedding,
+            k=k,
+            filter=filter,
+            return_metadata=return_metadata,
+            distance_threshold=distance_threshold,
+            **kwargs,
+        )
 
+    def similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = 4,
+        filter: Optional[RedisFilterExpression] = None,
+        return_metadata: bool = True,
+        distance_threshold: Optional[float] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Run similarity search between a query vector and the indexed vectors.
+
+        Args:
+            embedding (List[float]): The query vector for which to find similar
+                documents.
+            k (int): The number of documents to return. Default is 4.
+            filter (RedisFilterExpression, optional): Optional metadata filter.
+                Defaults to None.
+            return_metadata (bool, optional): Whether to return metadata.
+                Defaults to True.
+            distance_threshold (Optional[float], optional): Maximum vector distance
+                between selected documents and the query vector. Defaults to None.
+
+        Returns:
+            List[Document]: A list of documents that are most similar to the query
+                vector.
         """
         try:
             import redis
@@ -884,7 +930,7 @@
             )
 
         redis_query, params_dict = self._prepare_query(
-            query,
+            embedding,
             k=k,
             filter=filter,
             distance_threshold=distance_threshold,
@@ -920,6 +966,74 @@ class Redis(VectorStore):
             )
         return docs
 
+    def max_marginal_relevance_search(
+        self,
+        query: str,
+        k: int = 4,
+        fetch_k: int = 20,
+        lambda_mult: float = 0.5,
+        filter: Optional[RedisFilterExpression] = None,
+        return_metadata: bool = True,
+        distance_threshold: Optional[float] = None,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs selected using the maximal marginal relevance.
+
+        Maximal marginal relevance optimizes for similarity to query AND diversity
+        among selected documents.
+
+        Args:
+            query (str): Text to look up documents similar to.
+            k (int): Number of Documents to return. Defaults to 4.
+            fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
+            lambda_mult (float): Number between 0 and 1 that determines the degree
+                of diversity among the results with 0 corresponding
+                to maximum diversity and 1 to minimum diversity.
+                Defaults to 0.5.
+            filter (RedisFilterExpression, optional): Optional metadata filter.
+                Defaults to None.
+            return_metadata (bool, optional): Whether to return metadata.
+                Defaults to True.
+            distance_threshold (Optional[float], optional): Maximum vector distance
+                between selected documents and the query vector. Defaults to None.
+
+        Returns:
+            List[Document]: A list of Documents selected by maximal marginal relevance.
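+
+        Example:
+            A minimal illustrative sketch; it assumes an already-initialized
+            Redis vector store named ``rds``:
+
+            >>> from langchain.vectorstores.redis import RedisNum
+            >>> docs = rds.max_marginal_relevance_search(
+            ...     "pension plans",
+            ...     k=4,
+            ...     fetch_k=20,
+            ...     lambda_mult=0.5,  # 0 = max diversity, 1 = pure similarity
+            ...     filter=RedisNum("age") >= 18,
+            ... )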
+ """ + # Embed the query + query_embedding = self._embeddings.embed_query(query) + + # Fetch the initial documents + prefetch_docs = self.similarity_search_by_vector( + query_embedding, + k=fetch_k, + filter=filter, + return_metadata=return_metadata, + distance_threshold=distance_threshold, + **kwargs, + ) + prefetch_ids = [doc.metadata["id"] for doc in prefetch_docs] + + # Get the embeddings for the fetched documents + prefetch_embeddings = [ + _buffer_to_array( + cast( + bytes, + self.client.hget(prefetch_id, self._schema.content_vector_key), + ), + dtype=self._schema.vector_dtype, + ) + for prefetch_id in prefetch_ids + ] + + # Select documents using maximal marginal relevance + selected_indices = maximal_marginal_relevance( + np.array(query_embedding), prefetch_embeddings, lambda_mult=lambda_mult, k=k + ) + selected_docs = [prefetch_docs[i] for i in selected_indices] + + return selected_docs + def _collect_metadata(self, result: "Document") -> Dict[str, Any]: """Collect metadata from Redis. @@ -952,19 +1066,16 @@ class Redis(VectorStore): def _prepare_query( self, - query: str, + query_embedding: List[float], k: int = 4, filter: Optional[RedisFilterExpression] = None, distance_threshold: Optional[float] = None, with_metadata: bool = True, with_distance: bool = False, ) -> Tuple["Query", Dict[str, Any]]: - # Creates embedding vector from user query - embedding = self._embeddings.embed_query(query) - # Creates Redis query params_dict: Dict[str, Union[str, bytes, float]] = { - "vector": _array_to_buffer(embedding, self._schema.vector_dtype), + "vector": _array_to_buffer(query_embedding, self._schema.vector_dtype), } # prepare return fields including score diff --git a/libs/langchain/langchain/vectorstores/redis/filters.py b/libs/langchain/langchain/vectorstores/redis/filters.py index 0f6608bae8..633c8f4073 100644 --- a/libs/langchain/langchain/vectorstores/redis/filters.py +++ b/libs/langchain/langchain/vectorstores/redis/filters.py @@ -1,5 +1,6 @@ from enum import Enum from functools import wraps +from numbers import Number from typing import Any, Callable, Dict, List, Optional, Union from langchain.utilities.redis import TokenEscaper @@ -56,14 +57,15 @@ class RedisFilterField: if operator not in self.OPERATORS: raise ValueError( f"Operator {operator} not supported by {self.__class__.__name__}. " - + f"Supported operators are {self.OPERATORS.values()}" + + f"Supported operators are {self.OPERATORS.values()}." 
) if not isinstance(val, val_type): raise TypeError( f"Right side argument passed to operator {self.OPERATORS[operator]} " f"with left side " - f"argument {self.__class__.__name__} must be of type {val_type}" + f"argument {self.__class__.__name__} must be of type {val_type}, " + f"received value {val}" ) self._value = val self._operator = operator @@ -181,12 +183,12 @@ class RedisNum(RedisFilterField): RedisFilterOperator.GE: ">=", } OPERATOR_MAP: Dict[RedisFilterOperator, str] = { - RedisFilterOperator.EQ: "@%s:[%i %i]", - RedisFilterOperator.NE: "(-@%s:[%i %i])", - RedisFilterOperator.GT: "@%s:[(%i +inf]", - RedisFilterOperator.LT: "@%s:[-inf (%i]", - RedisFilterOperator.GE: "@%s:[%i +inf]", - RedisFilterOperator.LE: "@%s:[-inf %i]", + RedisFilterOperator.EQ: "@%s:[%f %f]", + RedisFilterOperator.NE: "(-@%s:[%f %f])", + RedisFilterOperator.GT: "@%s:[(%f +inf]", + RedisFilterOperator.LT: "@%s:[-inf (%f]", + RedisFilterOperator.GE: "@%s:[%f +inf]", + RedisFilterOperator.LE: "@%s:[-inf %f]", } def __str__(self) -> str: @@ -210,83 +212,83 @@ class RedisNum(RedisFilterField): return self.OPERATOR_MAP[self._operator] % (self._field, self._value) @check_operator_misuse - def __eq__(self, other: int) -> "RedisFilterExpression": + def __eq__(self, other: Union[int, float]) -> "RedisFilterExpression": """Create a Numeric equality filter expression Args: - other (int): The value to filter on. + other (Number): The value to filter on. Example: >>> from langchain.vectorstores.redis import RedisNum >>> filter = RedisNum("zipcode") == 90210 """ - self._set_value(other, int, RedisFilterOperator.EQ) + self._set_value(other, Number, RedisFilterOperator.EQ) return RedisFilterExpression(str(self)) @check_operator_misuse - def __ne__(self, other: int) -> "RedisFilterExpression": + def __ne__(self, other: Union[int, float]) -> "RedisFilterExpression": """Create a Numeric inequality filter expression Args: - other (int): The value to filter on. + other (Number): The value to filter on. Example: >>> from langchain.vectorstores.redis import RedisNum >>> filter = RedisNum("zipcode") != 90210 """ - self._set_value(other, int, RedisFilterOperator.NE) + self._set_value(other, Number, RedisFilterOperator.NE) return RedisFilterExpression(str(self)) - def __gt__(self, other: int) -> "RedisFilterExpression": + def __gt__(self, other: Union[int, float]) -> "RedisFilterExpression": """Create a RedisNumeric greater than filter expression Args: - other (int): The value to filter on. + other (Number): The value to filter on. Example: >>> from langchain.vectorstores.redis import RedisNum >>> filter = RedisNum("age") > 18 """ - self._set_value(other, int, RedisFilterOperator.GT) + self._set_value(other, Number, RedisFilterOperator.GT) return RedisFilterExpression(str(self)) - def __lt__(self, other: int) -> "RedisFilterExpression": + def __lt__(self, other: Union[int, float]) -> "RedisFilterExpression": """Create a Numeric less than filter expression Args: - other (int): The value to filter on. + other (Number): The value to filter on. Example: >>> from langchain.vectorstores.redis import RedisNum >>> filter = RedisNum("age") < 18 """ - self._set_value(other, int, RedisFilterOperator.LT) + self._set_value(other, Number, RedisFilterOperator.LT) return RedisFilterExpression(str(self)) - def __ge__(self, other: int) -> "RedisFilterExpression": + def __ge__(self, other: Union[int, float]) -> "RedisFilterExpression": """Create a Numeric greater than or equal to filter expression Args: - other (int): The value to filter on. 
+ other (Number): The value to filter on. Example: >>> from langchain.vectorstores.redis import RedisNum >>> filter = RedisNum("age") >= 18 """ - self._set_value(other, int, RedisFilterOperator.GE) + self._set_value(other, Number, RedisFilterOperator.GE) return RedisFilterExpression(str(self)) - def __le__(self, other: int) -> "RedisFilterExpression": + def __le__(self, other: Union[int, float]) -> "RedisFilterExpression": """Create a Numeric less than or equal to filter expression Args: - other (int): The value to filter on. + other (Number): The value to filter on. Example: >>> from langchain.vectorstores.redis import RedisNum >>> filter = RedisNum("age") <= 18 """ - self._set_value(other, int, RedisFilterOperator.LE) + self._set_value(other, Number, RedisFilterOperator.LE) return RedisFilterExpression(str(self)) diff --git a/libs/langchain/langchain/vectorstores/redis/schema.py b/libs/langchain/langchain/vectorstores/redis/schema.py index 1ecd921928..79833a94bc 100644 --- a/libs/langchain/langchain/vectorstores/redis/schema.py +++ b/libs/langchain/langchain/vectorstores/redis/schema.py @@ -1,3 +1,5 @@ +from __future__ import annotations + import os from enum import Enum from pathlib import Path @@ -5,19 +7,19 @@ from typing import Any, Dict, List, Optional, Union import numpy as np import yaml - -# ignore type error here as it's a redis-py type problem -from redis.commands.search.field import ( # type: ignore - NumericField, - TagField, - TextField, - VectorField, -) -from typing_extensions import Literal +from typing_extensions import TYPE_CHECKING, Literal from langchain.pydantic_v1 import BaseModel, Field, validator from langchain.vectorstores.redis.constants import REDIS_VECTOR_DTYPE_MAP +if TYPE_CHECKING: + from redis.commands.search.field import ( # type: ignore + NumericField, + TagField, + TextField, + VectorField, + ) + class RedisDistanceMetric(str, Enum): l2 = "L2" @@ -38,6 +40,8 @@ class TextFieldSchema(RedisField): sortable: Optional[bool] = False def as_field(self) -> TextField: + from redis.commands.search.field import TextField # type: ignore + return TextField( self.name, weight=self.weight, @@ -55,6 +59,8 @@ class TagFieldSchema(RedisField): sortable: Optional[bool] = False def as_field(self) -> TagField: + from redis.commands.search.field import TagField # type: ignore + return TagField( self.name, separator=self.separator, @@ -69,6 +75,8 @@ class NumericFieldSchema(RedisField): sortable: Optional[bool] = False def as_field(self) -> NumericField: + from redis.commands.search.field import NumericField # type: ignore + return NumericField(self.name, sortable=self.sortable, no_index=self.no_index) @@ -97,6 +105,8 @@ class FlatVectorField(RedisVectorField): block_size: int = Field(default=1000) def as_field(self) -> VectorField: + from redis.commands.search.field import VectorField # type: ignore + return VectorField( self.name, self.algorithm, @@ -118,6 +128,8 @@ class HNSWVectorField(RedisVectorField): epsilon: float = Field(default=0.8) def as_field(self) -> VectorField: + from redis.commands.search.field import VectorField # type: ignore + return VectorField( self.name, self.algorithm, diff --git a/libs/langchain/langchain/vectorstores/supabase.py b/libs/langchain/langchain/vectorstores/supabase.py index d911965346..71b7c2cc8b 100644 --- a/libs/langchain/langchain/vectorstores/supabase.py +++ b/libs/langchain/langchain/vectorstores/supabase.py @@ -168,10 +168,8 @@ class SupabaseVectorStore(VectorStore): filter: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> 
List[Document]: - vectors = self._embedding.embed_documents([query]) - return self.similarity_search_by_vector( - vectors[0], k=k, filter=filter, **kwargs - ) + vector = self._embedding.embed_query(query) + return self.similarity_search_by_vector(vector, k=k, filter=filter, **kwargs) def similarity_search_by_vector( self, @@ -195,24 +193,37 @@ class SupabaseVectorStore(VectorStore): filter: Optional[Dict[str, Any]] = None, **kwargs: Any, ) -> List[Tuple[Document, float]]: - vectors = self._embedding.embed_documents([query]) + vector = self._embedding.embed_query(query) return self.similarity_search_by_vector_with_relevance_scores( - vectors[0], k=k, filter=filter + vector, k=k, filter=filter ) def match_args( - self, query: List[float], k: int, filter: Optional[Dict[str, Any]] + self, query: List[float], filter: Optional[Dict[str, Any]] ) -> Dict[str, Any]: - ret = dict(query_embedding=query, match_count=k) + ret: Dict[str, Any] = dict(query_embedding=query) if filter: ret["filter"] = filter return ret def similarity_search_by_vector_with_relevance_scores( - self, query: List[float], k: int, filter: Optional[Dict[str, Any]] = None + self, + query: List[float], + k: int, + filter: Optional[Dict[str, Any]] = None, + postgrest_filter: Optional[str] = None, ) -> List[Tuple[Document, float]]: - match_documents_params = self.match_args(query, k, filter) - res = self._client.rpc(self.query_name, match_documents_params).execute() + match_documents_params = self.match_args(query, filter) + query_builder = self._client.rpc(self.query_name, match_documents_params) + + if postgrest_filter: + query_builder.params = query_builder.params.set( + "and", f"({postgrest_filter})" + ) + + query_builder.params = query_builder.params.set("limit", k) + + res = query_builder.execute() match_result = [ ( @@ -229,10 +240,23 @@ class SupabaseVectorStore(VectorStore): return match_result def similarity_search_by_vector_returning_embeddings( - self, query: List[float], k: int, filter: Optional[Dict[str, Any]] = None + self, + query: List[float], + k: int, + filter: Optional[Dict[str, Any]] = None, + postgrest_filter: Optional[str] = None, ) -> List[Tuple[Document, float, np.ndarray[np.float32, Any]]]: - match_documents_params = self.match_args(query, k, filter) - res = self._client.rpc(self.query_name, match_documents_params).execute() + match_documents_params = self.match_args(query, filter) + query_builder = self._client.rpc(self.query_name, match_documents_params) + + if postgrest_filter: + query_builder.params = query_builder.params.set( + "and", f"({postgrest_filter})" + ) + + query_builder.params = query_builder.params.set("limit", k) + + res = query_builder.execute() match_result = [ ( @@ -407,9 +431,9 @@ class SupabaseVectorStore(VectorStore): $$; ``` """ - embedding = self._embedding.embed_documents([query]) + embedding = self._embedding.embed_query(query) docs = self.max_marginal_relevance_search_by_vector( - embedding[0], k, fetch_k, lambda_mult=lambda_mult + embedding, k, fetch_k, lambda_mult=lambda_mult ) return docs diff --git a/libs/langchain/langchain/vectorstores/vearch.py b/libs/langchain/langchain/vectorstores/vearch.py new file mode 100644 index 0000000000..99706d2e98 --- /dev/null +++ b/libs/langchain/langchain/vectorstores/vearch.py @@ -0,0 +1,401 @@ +from __future__ import annotations + +import os +import time +import uuid +from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Type + +import numpy as np + +from langchain.docstore.document import Document +from 
langchain.embeddings.base import Embeddings
+from langchain.vectorstores.base import VectorStore
+
+if TYPE_CHECKING:
+    import vearch
+
+DEFAULT_TOPN = 4
+
+
+class VearchDb(VectorStore):
+    _DEFAULT_TABLE_NAME = "langchain_vearch"
+
+    def __init__(
+        self,
+        embedding_function: Embeddings,
+        table_name: str = _DEFAULT_TABLE_NAME,
+        metadata_path: Optional[str] = None,
+        **kwargs: Any,
+    ) -> None:
+        """Initialize the vearch vector store."""
+        try:
+            import vearch
+        except ImportError:
+            raise ValueError(
+                "Could not import vearch python package. "
+                "Please install it with `pip install vearch`."
+            )
+
+        if metadata_path is None:
+            metadata_path = os.getcwd().replace("\\", "/")
+        if not os.path.isdir(metadata_path):
+            os.makedirs(metadata_path)
+        log_path = os.path.join(metadata_path, "log")
+        if not os.path.isdir(log_path):
+            os.makedirs(log_path)
+        self.vearch_engine = vearch.Engine(metadata_path, log_path)
+
+        if not table_name:
+            table_name = self._DEFAULT_TABLE_NAME
+            table_name += "_"
+            table_name += str(uuid.uuid4()).split("-")[-1]
+        self.using_table_name = table_name
+        self.using_metapath = metadata_path
+        self.embedding_func = embedding_function
+
+    @property
+    def embeddings(self) -> Optional[Embeddings]:
+        return self.embedding_func
+
+    @classmethod
+    def from_documents(
+        cls: Type[VearchDb],
+        documents: List[Document],
+        embedding: Embeddings,
+        table_name: str = "langchain_vearch",
+        metadata_path: Optional[str] = None,
+        **kwargs: Any,
+    ) -> VearchDb:
+        """Return a Vearch VectorStore initialized from documents."""
+
+        texts = [d.page_content for d in documents]
+        metadatas = [d.metadata for d in documents]
+
+        return cls.from_texts(
+            texts=texts,
+            embedding=embedding,
+            metadatas=metadatas,
+            table_name=table_name,
+            metadata_path=metadata_path,
+            **kwargs,
+        )
+
+    @classmethod
+    def from_texts(
+        cls: Type[VearchDb],
+        texts: List[str],
+        embedding: Embeddings,
+        metadatas: Optional[List[dict]] = None,
+        table_name: str = _DEFAULT_TABLE_NAME,
+        metadata_path: Optional[str] = None,
+        **kwargs: Any,
+    ) -> VearchDb:
+        """Return a Vearch VectorStore initialized from texts."""
+
+        vearch_db = cls(
+            embedding_function=embedding,
+            table_name=table_name,
+            metadata_path=metadata_path,
+        )
+        vearch_db.add_texts(texts=texts, metadatas=metadatas)
+        return vearch_db
+
+    def _create_table(
+        self,
+        dim: int = 1024,
+        field_list: List[dict] = [
+            {"field": "text", "type": "str"},
+            {"field": "metadata", "type": "str"},
+        ],
+    ) -> int:
+        """
+        Create a VectorStore table.
+        Args:
+            dim: dimension of the vectors
+            field_list: the fields you want to store
+        Return:
+            code, 0 for success, non-zero for failure
+        """
+        # Runtime import: the module-level import above is type-checking only.
+        import vearch
+
+        type_dict = {"int": vearch.dataType.INT, "str": vearch.dataType.STRING}
+        engine_info = {
+            "index_size": 10000,
+            "retrieval_type": "IVFPQ",
+            "retrieval_param": {"ncentroids": 2048, "nsubvector": 32},
+        }
+        fields = [
+            vearch.GammaFieldInfo(fi["field"], type_dict[fi["type"]])
+            for fi in field_list
+        ]
+        vector_field = vearch.GammaVectorInfo(
+            name="text_embedding",
+            type=vearch.dataType.VECTOR,
+            is_index=True,
+            dimension=dim,
+            model_id="",
+            store_type="MemoryOnly",
+            store_param={"cache_size": 10000},
+            has_source=False,
+        )
+        response_code = self.vearch_engine.create_table(
+            engine_info,
+            name=self.using_table_name,
+            fields=fields,
+            vector_field=vector_field,
+        )
+        return response_code
+
+    def add_texts(
+        self,
+        texts: Iterable[str],
+        metadatas: Optional[List[dict]] = None,
+        **kwargs: Any,
+    ) -> List[str]:
+        """Run texts through the embeddings and add them to the vectorstore.
+
+        Returns:
+            List of ids from adding the texts into the vectorstore.
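+
+        Example:
+            A minimal illustrative sketch; it assumes an already-initialized
+            VearchDb instance named ``store``. Each metadata dict needs a
+            ``"source"`` key, which is what gets stored alongside the text:
+
+            >>> ids = store.add_texts(
+            ...     texts=["alpha", "beta"],
+            ...     metadatas=[{"source": "a.txt"}, {"source": "b.txt"}],
+            ... )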
+        """
+        embeddings = None
+        if self.embedding_func is not None:
+            embeddings = self.embedding_func.embed_documents(list(texts))
+        table_path = os.path.join(
+            self.using_metapath, self.using_table_name + ".schema"
+        )
+        if not os.path.exists(table_path):
+            if embeddings is None:
+                raise ValueError("Embeddings are required to create a new table.")
+            dim = len(embeddings[0])
+            response_code = self._create_table(dim)
+            if response_code:
+                raise ValueError("Failed to create table.")
+        docid: List[str] = []
+        if embeddings is not None and metadatas is not None:
+            doc_items = []
+            for text, metadata, embed in zip(texts, metadatas, embeddings):
+                profiles: dict[str, Any] = {}
+                profiles["text"] = text
+                profiles["metadata"] = metadata["source"]
+                profiles["text_embedding"] = embed
+                doc_items.append(profiles)
+
+            docid = self.vearch_engine.add(doc_items)
+            t_time = 0
+            while len(docid) != len(embeddings):
+                time.sleep(0.5)
+                if t_time > 6:
+                    break
+                t_time += 1
+            self.vearch_engine.dump()
+        return docid
+
+    def _load(self) -> None:
+        """Load the vearch engine."""
+        self.vearch_engine.load()
+
+    @classmethod
+    def load_local(
+        cls,
+        embedding: Embeddings,
+        table_name: str = _DEFAULT_TABLE_NAME,
+        metadata_path: Optional[str] = None,
+        **kwargs: Any,
+    ) -> VearchDb:
+        """Load the specified table from local storage.
+
+        Returns:
+            A VearchDb instance backed by the specified local table.
+        """
+        if not metadata_path:
+            raise ValueError("No metadata path provided.")
+        if not table_name:
+            raise ValueError("No table name provided.")
+        table_path = os.path.join(metadata_path, table_name + ".schema")
+        if not os.path.exists(table_path):
+            raise ValueError("Vearch table does not exist at the given path.")
+        vearch_db = cls(
+            embedding_function=embedding,
+            table_name=table_name,
+            metadata_path=metadata_path,
+        )
+        vearch_db._load()
+        return vearch_db
+
+    def similarity_search(
+        self,
+        query: str,
+        k: int = DEFAULT_TOPN,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return docs most similar to query."""
+        if self.vearch_engine is None:
+            raise ValueError("Vearch engine is not initialized.")
+        if self.embedding_func is None:
+            raise ValueError("embedding_func is not set.")
+        embeddings = self.embedding_func.embed_query(query)
+        docs = self.similarity_search_by_vector(embeddings, k)
+        return docs
+
+    def similarity_search_by_vector(
+        self,
+        embedding: List[float],
+        k: int = DEFAULT_TOPN,
+        **kwargs: Any,
+    ) -> List[Document]:
+        """Return the k documents most similar to the given embedding vector.
+
+        Args:
+            embedding: embedding vector of the query.
+            k: the number of most similar documents to return.
+        Returns:
+            The k documents most similar to the given embedding vector.
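+
+        Example:
+            A minimal illustrative sketch; it assumes an already-initialized
+            VearchDb instance named ``store`` and an ``Embeddings``
+            implementation named ``embedder``:
+
+            >>> vector = embedder.embed_query("alpha")
+            >>> docs = store.similarity_search_by_vector(vector, k=2)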
+        """
+        query_data = {
+            "vector": [
+                {
+                    "field": "text_embedding",
+                    "feature": np.array(embedding),
+                }
+            ],
+            "fields": [],
+            "is_brute_search": 1,
+            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
+            "topn": k,
+        }
+        query_result = self.vearch_engine.search(query_data)
+        docs = []
+        for item in query_result[0]["result_items"]:
+            content = ""
+            meta_data = {}
+            for item_key in item:
+                if item_key == "text":
+                    content = item[item_key]
+                    continue
+                if item_key == "metadata":
+                    meta_data["source"] = item[item_key]
+                    continue
+            docs.append(Document(page_content=content, metadata=meta_data))
+        return docs
+
+    def similarity_search_with_score(
+        self,
+        query: str,
+        k: int = DEFAULT_TOPN,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        """Return the k documents most similar to the query text, with scores.
+
+        Args:
+            query: text of the query.
+            k: the number of most similar documents to return.
+        Returns:
+            The k documents most similar to the query, each paired with its
+            score. 0 is dissimilar, 1 is the most similar.
+        """
+        if self.embedding_func is None:
+            raise ValueError("embedding_func is not set.")
+        embeddings = self.embedding_func.embed_query(query)
+        query_data = {
+            "vector": [
+                {
+                    "field": "text_embedding",
+                    "feature": np.array(embeddings),
+                }
+            ],
+            "fields": [],
+            "is_brute_search": 1,
+            "retrieval_param": {"metric_type": "InnerProduct", "nprobe": 20},
+            "topn": k,
+        }
+        query_result = self.vearch_engine.search(query_data)
+        results: List[Tuple[Document, float]] = []
+        for item in query_result[0]["result_items"]:
+            content = ""
+            meta_data = {}
+            score = 0.0  # default if the result item carries no score field
+            for item_key in item:
+                if item_key == "text":
+                    content = item[item_key]
+                    continue
+                if item_key == "metadata":
+                    meta_data["source"] = item[item_key]
+                    continue
+                if item_key == "score":
+                    score = item[item_key]
+                    continue
+            tmp_res = (Document(page_content=content, metadata=meta_data), score)
+            results.append(tmp_res)
+        return results
+
+    def _similarity_search_with_relevance_scores(
+        self,
+        query: str,
+        k: int = 4,
+        **kwargs: Any,
+    ) -> List[Tuple[Document, float]]:
+        return self.similarity_search_with_score(query, k, **kwargs)
+
+    def delete(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> Optional[bool]:
+        """Delete the documents which have the specified ids.
+
+        Args:
+            ids: The ids of the embedding vectors.
+            **kwargs: Other keyword arguments that subclasses might use.
+        Returns:
+            Optional[bool]: True if deletion is successful.
+            False otherwise, None if not implemented.
+        """
+        if self.vearch_engine is None:
+            raise ValueError("Vearch engine is not initialized.")
+        ret: Optional[bool] = None
+        tmp_res = []
+        if not ids:
+            return ret
+        for _id in ids:
+            ret = self.vearch_engine.del_doc(_id)
+            tmp_res.append(ret)
+        ret = all(i == 0 for i in tmp_res)
+        return ret
+
+    def get(
+        self,
+        ids: Optional[List[str]] = None,
+        **kwargs: Any,
+    ) -> Dict[str, Document]:
+        """Return docs according to the given ids.
+
+        Args:
+            ids: The ids of the embedding vectors.
+        Returns:
+            Documents which satisfy the input conditions.
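+
+        Example:
+            A minimal illustrative sketch; ``doc_ids`` stands in for ids
+            previously returned by ``add_texts``:
+
+            >>> found = store.get(ids=doc_ids)  # maps id -> Document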
+        """
+
+        if self.vearch_engine is None:
+            raise ValueError("Vearch engine is not initialized.")
+        results: Dict[str, Document] = {}
+        if not ids:
+            return results
+        for doc_id in ids:
+            docs_detail = self.vearch_engine.get_doc_by_id(doc_id)
+            if docs_detail == {}:
+                continue
+
+            content = ""
+            meta_info = {}
+            for field in docs_detail:
+                if field == "text":
+                    content = docs_detail[field]
+                    continue
+                elif field == "metadata":
+                    meta_info["source"] = docs_detail[field]
+                    continue
+            results[docs_detail["_id"]] = Document(
+                page_content=content, metadata=meta_info
+            )
+        return results
diff --git a/libs/langchain/langchain/vectorstores/vectara.py b/libs/langchain/langchain/vectorstores/vectara.py
index 457511b104..3e8a2549e2 100644
--- a/libs/langchain/langchain/vectorstores/vectara.py
+++ b/libs/langchain/langchain/vectorstores/vectara.py
@@ -396,8 +396,12 @@ class Vectara(VectorStore):
                 vectara_api_key=api_key,
             )
         """
-        # Note: Vectara generates its own embeddings, so we ignore the provided
-        # embeddings (required by interface)
+        # Notes:
+        # * Vectara generates its own embeddings, so we ignore the provided
+        #   embeddings (required by interface)
+        # * When metadatas[] are provided, they are associated with each "part"
+        #   in Vectara. doc_metadata can be used to provide additional metadata
+        #   for the document itself (applies to all "texts" in this call)
         doc_metadata = kwargs.pop("doc_metadata", {})
         vectara = cls(**kwargs)
         vectara.add_texts(texts, metadatas, doc_metadata=doc_metadata, **kwargs)
diff --git a/libs/langchain/poetry.lock b/libs/langchain/poetry.lock
index 17756f0f68..399dcc2cab 100644
--- a/libs/langchain/poetry.lock
+++ b/libs/langchain/poetry.lock
@@ -3561,7 +3561,6 @@ optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
 files = [
     {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
-    {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
 ]
 
 [[package]]
@@ -5892,67 +5891,77 @@ files = [
 
 [[package]]
 name = "pillow"
-version = "10.0.0"
+version = "9.5.0"
 description = "Python Imaging Library (Fork)"
 optional = true
-python-versions = ">=3.8"
+python-versions = ">=3.7"
 files = [
-    {file = "Pillow-10.0.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:1f62406a884ae75fb2f818694469519fb685cc7eaff05d3451a9ebe55c646891"},
-    {file = "Pillow-10.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d5db32e2a6ccbb3d34d87c87b432959e0db29755727afb37290e10f6e8e62614"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:edf4392b77bdc81f36e92d3a07a5cd072f90253197f4a52a55a8cec48a12483b"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:520f2a520dc040512699f20fa1c363eed506e94248d71f85412b625026f6142c"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:8c11160913e3dd06c8ffdb5f233a4f254cb449f4dfc0f8f4549eda9e542c93d1"},
-    {file = "Pillow-10.0.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:a74ba0c356aaa3bb8e3eb79606a87669e7ec6444be352870623025d75a14a2bf"},
-    {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d5d0dae4cfd56969d23d94dc8e89fb6a217be461c69090768227beb8ed28c0a3"},
-    {file = "Pillow-10.0.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:22c10cc517668d44b211717fd9775799ccec4124b9a7f7b3635fc5386e584992"},
-    
{file = "Pillow-10.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:dffe31a7f47b603318c609f378ebcd57f1554a3a6a8effbc59c3c69f804296de"}, - {file = "Pillow-10.0.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:9fb218c8a12e51d7ead2a7c9e101a04982237d4855716af2e9499306728fb485"}, - {file = "Pillow-10.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:d35e3c8d9b1268cbf5d3670285feb3528f6680420eafe35cccc686b73c1e330f"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3ed64f9ca2f0a95411e88a4efbd7a29e5ce2cea36072c53dd9d26d9c76f753b3"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0b6eb5502f45a60a3f411c63187db83a3d3107887ad0d036c13ce836f8a36f1d"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c1fbe7621c167ecaa38ad29643d77a9ce7311583761abf7836e1510c580bf3dd"}, - {file = "Pillow-10.0.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:cd25d2a9d2b36fcb318882481367956d2cf91329f6892fe5d385c346c0649629"}, - {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:3b08d4cc24f471b2c8ca24ec060abf4bebc6b144cb89cba638c720546b1cf538"}, - {file = "Pillow-10.0.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:d737a602fbd82afd892ca746392401b634e278cb65d55c4b7a8f48e9ef8d008d"}, - {file = "Pillow-10.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:3a82c40d706d9aa9734289740ce26460a11aeec2d9c79b7af87bb35f0073c12f"}, - {file = "Pillow-10.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:bc2ec7c7b5d66b8ec9ce9f720dbb5fa4bace0f545acd34870eff4a369b44bf37"}, - {file = "Pillow-10.0.0-cp312-cp312-macosx_10_10_x86_64.whl", hash = "sha256:d80cf684b541685fccdd84c485b31ce73fc5c9b5d7523bf1394ce134a60c6883"}, - {file = "Pillow-10.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:76de421f9c326da8f43d690110f0e79fe3ad1e54be811545d7d91898b4c8493e"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:81ff539a12457809666fef6624684c008e00ff6bf455b4b89fd00a140eecd640"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce543ed15570eedbb85df19b0a1a7314a9c8141a36ce089c0a894adbfccb4568"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:685ac03cc4ed5ebc15ad5c23bc555d68a87777586d970c2c3e216619a5476223"}, - {file = "Pillow-10.0.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:d72e2ecc68a942e8cf9739619b7f408cc7b272b279b56b2c83c6123fcfa5cdff"}, - {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:d50b6aec14bc737742ca96e85d6d0a5f9bfbded018264b3b70ff9d8c33485551"}, - {file = "Pillow-10.0.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:00e65f5e822decd501e374b0650146063fbb30a7264b4d2744bdd7b913e0cab5"}, - {file = "Pillow-10.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:f31f9fdbfecb042d046f9d91270a0ba28368a723302786c0009ee9b9f1f60199"}, - {file = "Pillow-10.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:1ce91b6ec08d866b14413d3f0bbdea7e24dfdc8e59f562bb77bc3fe60b6144ca"}, - {file = "Pillow-10.0.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:349930d6e9c685c089284b013478d6f76e3a534e36ddfa912cde493f235372f3"}, - {file = "Pillow-10.0.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3a684105f7c32488f7153905a4e3015a3b6c7182e106fe3c37fbb5ef3e6994c3"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:b4f69b3700201b80bb82c3a97d5e9254084f6dd5fb5b16fc1a7b974260f89f43"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3f07ea8d2f827d7d2a49ecf1639ec02d75ffd1b88dcc5b3a61bbb37a8759ad8d"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:040586f7d37b34547153fa383f7f9aed68b738992380ac911447bb78f2abe530"}, - {file = "Pillow-10.0.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:f88a0b92277de8e3ca715a0d79d68dc82807457dae3ab8699c758f07c20b3c51"}, - {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:c7cf14a27b0d6adfaebb3ae4153f1e516df54e47e42dcc073d7b3d76111a8d86"}, - {file = "Pillow-10.0.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:3400aae60685b06bb96f99a21e1ada7bc7a413d5f49bce739828ecd9391bb8f7"}, - {file = "Pillow-10.0.0-cp38-cp38-win_amd64.whl", hash = "sha256:dbc02381779d412145331789b40cc7b11fdf449e5d94f6bc0b080db0a56ea3f0"}, - {file = "Pillow-10.0.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:9211e7ad69d7c9401cfc0e23d49b69ca65ddd898976d660a2fa5904e3d7a9baa"}, - {file = "Pillow-10.0.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:faaf07ea35355b01a35cb442dd950d8f1bb5b040a7787791a535de13db15ed90"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c9f72a021fbb792ce98306ffb0c348b3c9cb967dce0f12a49aa4c3d3fdefa967"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9f7c16705f44e0504a3a2a14197c1f0b32a95731d251777dcb060aa83022cb2d"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:76edb0a1fa2b4745fb0c99fb9fb98f8b180a1bbceb8be49b087e0b21867e77d3"}, - {file = "Pillow-10.0.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:368ab3dfb5f49e312231b6f27b8820c823652b7cd29cfbd34090565a015e99ba"}, - {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:608bfdee0d57cf297d32bcbb3c728dc1da0907519d1784962c5f0c68bb93e5a3"}, - {file = "Pillow-10.0.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:5c6e3df6bdd396749bafd45314871b3d0af81ff935b2d188385e970052091017"}, - {file = "Pillow-10.0.0-cp39-cp39-win_amd64.whl", hash = "sha256:7be600823e4c8631b74e4a0d38384c73f680e6105a7d3c6824fcf226c178c7e6"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-macosx_10_10_x86_64.whl", hash = "sha256:92be919bbc9f7d09f7ae343c38f5bb21c973d2576c1d45600fce4b74bafa7ac0"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f8182b523b2289f7c415f589118228d30ac8c355baa2f3194ced084dac2dbba"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:38250a349b6b390ee6047a62c086d3817ac69022c127f8a5dc058c31ccef17f3"}, - {file = "Pillow-10.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:88af2003543cc40c80f6fca01411892ec52b11021b3dc22ec3bc9d5afd1c5334"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c189af0545965fa8d3b9613cfdb0cd37f9d71349e0f7750e1fd704648d475ed2"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce7b031a6fc11365970e6a5686d7ba8c63e4c1cf1ea143811acbb524295eabed"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:db24668940f82321e746773a4bc617bfac06ec831e5c88b643f91f122a785684"}, - {file = "Pillow-10.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:efe8c0681042536e0d06c11f48cebe759707c9e9abf880ee213541c5b46c5bf3"}, - {file 
= "Pillow-10.0.0.tar.gz", hash = "sha256:9c82b5b3e043c7af0d95792d0d20ccf68f61a1fec6b3530e718b688422727396"}, + {file = "Pillow-9.5.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:ace6ca218308447b9077c14ea4ef381ba0b67ee78d64046b3f19cf4e1139ad16"}, + {file = "Pillow-9.5.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d3d403753c9d5adc04d4694d35cf0391f0f3d57c8e0030aac09d7678fa8030aa"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ba1b81ee69573fe7124881762bb4cd2e4b6ed9dd28c9c60a632902fe8db8b38"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fe7e1c262d3392afcf5071df9afa574544f28eac825284596ac6db56e6d11062"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8f36397bf3f7d7c6a3abdea815ecf6fd14e7fcd4418ab24bae01008d8d8ca15e"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:252a03f1bdddce077eff2354c3861bf437c892fb1832f75ce813ee94347aa9b5"}, + {file = "Pillow-9.5.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:85ec677246533e27770b0de5cf0f9d6e4ec0c212a1f89dfc941b64b21226009d"}, + {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:b416f03d37d27290cb93597335a2f85ed446731200705b22bb927405320de903"}, + {file = "Pillow-9.5.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:1781a624c229cb35a2ac31cc4a77e28cafc8900733a864870c49bfeedacd106a"}, + {file = "Pillow-9.5.0-cp310-cp310-win32.whl", hash = "sha256:8507eda3cd0608a1f94f58c64817e83ec12fa93a9436938b191b80d9e4c0fc44"}, + {file = "Pillow-9.5.0-cp310-cp310-win_amd64.whl", hash = "sha256:d3c6b54e304c60c4181da1c9dadf83e4a54fd266a99c70ba646a9baa626819eb"}, + {file = "Pillow-9.5.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:7ec6f6ce99dab90b52da21cf0dc519e21095e332ff3b399a357c187b1a5eee32"}, + {file = "Pillow-9.5.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:560737e70cb9c6255d6dcba3de6578a9e2ec4b573659943a5e7e4af13f298f5c"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:96e88745a55b88a7c64fa49bceff363a1a27d9a64e04019c2281049444a571e3"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d9c206c29b46cfd343ea7cdfe1232443072bbb270d6a46f59c259460db76779a"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cfcc2c53c06f2ccb8976fb5c71d448bdd0a07d26d8e07e321c103416444c7ad1"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:a0f9bb6c80e6efcde93ffc51256d5cfb2155ff8f78292f074f60f9e70b942d99"}, + {file = "Pillow-9.5.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:8d935f924bbab8f0a9a28404422da8af4904e36d5c33fc6f677e4c4485515625"}, + {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:fed1e1cf6a42577953abbe8e6cf2fe2f566daebde7c34724ec8803c4c0cda579"}, + {file = "Pillow-9.5.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:c1170d6b195555644f0616fd6ed929dfcf6333b8675fcca044ae5ab110ded296"}, + {file = "Pillow-9.5.0-cp311-cp311-win32.whl", hash = "sha256:54f7102ad31a3de5666827526e248c3530b3a33539dbda27c6843d19d72644ec"}, + {file = "Pillow-9.5.0-cp311-cp311-win_amd64.whl", hash = "sha256:cfa4561277f677ecf651e2b22dc43e8f5368b74a25a8f7d1d4a3a243e573f2d4"}, + {file = "Pillow-9.5.0-cp311-cp311-win_arm64.whl", hash = "sha256:965e4a05ef364e7b973dd17fc765f42233415974d773e82144c9bbaaaea5d089"}, + {file = 
"Pillow-9.5.0-cp312-cp312-win32.whl", hash = "sha256:22baf0c3cf0c7f26e82d6e1adf118027afb325e703922c8dfc1d5d0156bb2eeb"}, + {file = "Pillow-9.5.0-cp312-cp312-win_amd64.whl", hash = "sha256:432b975c009cf649420615388561c0ce7cc31ce9b2e374db659ee4f7d57a1f8b"}, + {file = "Pillow-9.5.0-cp37-cp37m-macosx_10_10_x86_64.whl", hash = "sha256:5d4ebf8e1db4441a55c509c4baa7a0587a0210f7cd25fcfe74dbbce7a4bd1906"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:375f6e5ee9620a271acb6820b3d1e94ffa8e741c0601db4c0c4d3cb0a9c224bf"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:99eb6cafb6ba90e436684e08dad8be1637efb71c4f2180ee6b8f940739406e78"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2dfaaf10b6172697b9bceb9a3bd7b951819d1ca339a5ef294d1f1ac6d7f63270"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:763782b2e03e45e2c77d7779875f4432e25121ef002a41829d8868700d119392"}, + {file = "Pillow-9.5.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:35f6e77122a0c0762268216315bf239cf52b88865bba522999dc38f1c52b9b47"}, + {file = "Pillow-9.5.0-cp37-cp37m-win32.whl", hash = "sha256:aca1c196f407ec7cf04dcbb15d19a43c507a81f7ffc45b690899d6a76ac9fda7"}, + {file = "Pillow-9.5.0-cp37-cp37m-win_amd64.whl", hash = "sha256:322724c0032af6692456cd6ed554bb85f8149214d97398bb80613b04e33769f6"}, + {file = "Pillow-9.5.0-cp38-cp38-macosx_10_10_x86_64.whl", hash = "sha256:a0aa9417994d91301056f3d0038af1199eb7adc86e646a36b9e050b06f526597"}, + {file = "Pillow-9.5.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:f8286396b351785801a976b1e85ea88e937712ee2c3ac653710a4a57a8da5d9c"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c830a02caeb789633863b466b9de10c015bded434deb3ec87c768e53752ad22a"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:fbd359831c1657d69bb81f0db962905ee05e5e9451913b18b831febfe0519082"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f8fc330c3370a81bbf3f88557097d1ea26cd8b019d6433aa59f71195f5ddebbf"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_aarch64.whl", hash = "sha256:7002d0797a3e4193c7cdee3198d7c14f92c0836d6b4a3f3046a64bd1ce8df2bf"}, + {file = "Pillow-9.5.0-cp38-cp38-manylinux_2_28_x86_64.whl", hash = "sha256:229e2c79c00e85989a34b5981a2b67aa079fd08c903f0aaead522a1d68d79e51"}, + {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9adf58f5d64e474bed00d69bcd86ec4bcaa4123bfa70a65ce72e424bfb88ed96"}, + {file = "Pillow-9.5.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:662da1f3f89a302cc22faa9f14a262c2e3951f9dbc9617609a47521c69dd9f8f"}, + {file = "Pillow-9.5.0-cp38-cp38-win32.whl", hash = "sha256:6608ff3bf781eee0cd14d0901a2b9cc3d3834516532e3bd673a0a204dc8615fc"}, + {file = "Pillow-9.5.0-cp38-cp38-win_amd64.whl", hash = "sha256:e49eb4e95ff6fd7c0c402508894b1ef0e01b99a44320ba7d8ecbabefddcc5569"}, + {file = "Pillow-9.5.0-cp39-cp39-macosx_10_10_x86_64.whl", hash = "sha256:482877592e927fd263028c105b36272398e3e1be3269efda09f6ba21fd83ec66"}, + {file = "Pillow-9.5.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3ded42b9ad70e5f1754fb7c2e2d6465a9c842e41d178f262e08b8c85ed8a1d8e"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c446d2245ba29820d405315083d55299a796695d747efceb5717a8b450324115"}, + {file = 
"Pillow-9.5.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8aca1152d93dcc27dc55395604dcfc55bed5f25ef4c98716a928bacba90d33a3"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:608488bdcbdb4ba7837461442b90ea6f3079397ddc968c31265c1e056964f1ef"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_aarch64.whl", hash = "sha256:60037a8db8750e474af7ffc9faa9b5859e6c6d0a50e55c45576bf28be7419705"}, + {file = "Pillow-9.5.0-cp39-cp39-manylinux_2_28_x86_64.whl", hash = "sha256:07999f5834bdc404c442146942a2ecadd1cb6292f5229f4ed3b31e0a108746b1"}, + {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:a127ae76092974abfbfa38ca2d12cbeddcdeac0fb71f9627cc1135bedaf9d51a"}, + {file = "Pillow-9.5.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:489f8389261e5ed43ac8ff7b453162af39c3e8abd730af8363587ba64bb2e865"}, + {file = "Pillow-9.5.0-cp39-cp39-win32.whl", hash = "sha256:9b1af95c3a967bf1da94f253e56b6286b50af23392a886720f563c547e48e964"}, + {file = "Pillow-9.5.0-cp39-cp39-win_amd64.whl", hash = "sha256:77165c4a5e7d5a284f10a6efaa39a0ae8ba839da344f20b111d62cc932fa4e5d"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-macosx_10_10_x86_64.whl", hash = "sha256:833b86a98e0ede388fa29363159c9b1a294b0905b5128baf01db683672f230f5"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:aaf305d6d40bd9632198c766fb64f0c1a83ca5b667f16c1e79e1661ab5060140"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0852ddb76d85f127c135b6dd1f0bb88dbb9ee990d2cd9aa9e28526c93e794fba"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:91ec6fe47b5eb5a9968c79ad9ed78c342b1f97a091677ba0e012701add857829"}, + {file = "Pillow-9.5.0-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:cb841572862f629b99725ebaec3287fc6d275be9b14443ea746c1dd325053cbd"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-macosx_10_10_x86_64.whl", hash = "sha256:c380b27d041209b849ed246b111b7c166ba36d7933ec6e41175fd15ab9eb1572"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7c9af5a3b406a50e313467e3565fc99929717f780164fe6fbb7704edba0cebbe"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5671583eab84af046a397d6d0ba25343c00cd50bce03787948e0fff01d4fd9b1"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:84a6f19ce086c1bf894644b43cd129702f781ba5751ca8572f08aa40ef0ab7b7"}, + {file = "Pillow-9.5.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:1e7723bd90ef94eda669a3c2c19d549874dd5badaeefabefd26053304abe5799"}, + {file = "Pillow-9.5.0.tar.gz", hash = "sha256:bf548479d336726d7a0eceb6e767e179fbde37833ae42794602631a070d630f1"}, ] [package.extras] @@ -8783,40 +8792,40 @@ tests = ["cython", "littleutils", "pygments", "pytest", "typeguard"] [[package]] name = "streamlit" -version = "1.22.0" +version = "1.26.0" description = "A faster way to build and share data apps" optional = true -python-versions = ">=3.7, !=3.9.7" -files = [ - {file = "streamlit-1.22.0-py2.py3-none-any.whl", hash = "sha256:520dd9b9e6efb559b5a9a22feadb48b1e6f0340ec83da3514810059fdecd4167"}, - {file = "streamlit-1.22.0.tar.gz", hash = "sha256:5bef9bf8deef32814d9565c9df48331e6357eb0b90dabc3ec4f53c44fb34fc73"}, -] - -[package.dependencies] -altair = ">=3.2.0,<5" -blinker = ">=1.0.0" -cachetools = ">=4.0" -click = ">=7.0" -gitpython = "!=3.1.19" 
-importlib-metadata = ">=1.4" -numpy = "*" -packaging = ">=14.1" -pandas = ">=0.25,<3" -pillow = ">=6.2.0" -protobuf = ">=3.12,<4" -pyarrow = ">=4.0" -pydeck = ">=0.1.dev5" -pympler = ">=0.9" -python-dateutil = "*" -requests = ">=2.4" -rich = ">=10.11.0" -tenacity = ">=8.0.0,<9" -toml = "*" -tornado = ">=6.0.3" -typing-extensions = ">=3.10.0.0" -tzlocal = ">=1.1" -validators = ">=0.2" -watchdog = {version = "*", markers = "platform_system != \"Darwin\""} +python-versions = ">=3.8, !=3.9.7" +files = [ + {file = "streamlit-1.26.0-py2.py3-none-any.whl", hash = "sha256:2bfdac041816e2e1ba27f061d40112afe61e0d4e72d25f354b38ba81107b4cb3"}, + {file = "streamlit-1.26.0.tar.gz", hash = "sha256:25475fb15a3cc9fb184945f3fc936f011998bd8386e0c892febe14c9625bf47a"}, +] + +[package.dependencies] +altair = ">=4.0,<6" +blinker = ">=1.0.0,<2" +cachetools = ">=4.0,<6" +click = ">=7.0,<9" +gitpython = ">=3.0.7,<3.1.19 || >3.1.19,<4" +importlib-metadata = ">=1.4,<7" +numpy = ">=1.19.3,<2" +packaging = ">=16.8,<24" +pandas = ">=1.3.0,<3" +pillow = ">=7.1.0,<10" +protobuf = ">=3.20,<5" +pyarrow = ">=6.0" +pydeck = ">=0.8,<1" +pympler = ">=0.9,<2" +python-dateutil = ">=2.7.3,<3" +requests = ">=2.18,<3" +rich = ">=10.14.0,<14" +tenacity = ">=8.1.0,<9" +toml = ">=0.10.1,<2" +tornado = ">=6.0.3,<7" +typing-extensions = ">=4.1.0,<5" +tzlocal = ">=1.1,<5" +validators = ">=0.2,<1" +watchdog = {version = ">=2.1.5", markers = "platform_system != \"Darwin\""} [package.extras] snowflake = ["snowflake-snowpark-python"] @@ -10472,7 +10481,7 @@ clarifai = ["clarifai"] cohere = ["cohere"] docarray = ["docarray"] embeddings = ["sentence-transformers"] -extended-testing = ["amazon-textract-caller", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "sentence-transformers", "sqlite-vss", "streamlit", "sympy", "telethon", "tqdm", "vowpal-wabbit-next", "xata", "xmltodict"] +extended-testing = ["amazon-textract-caller", "assemblyai", "atlassian-python-api", "beautifulsoup4", "bibtexparser", "cassio", "chardet", "dashvector", "esprima", "faiss-cpu", "feedparser", "geopandas", "gitpython", "gql", "html2text", "jinja2", "jq", "lxml", "markdownify", "mwparserfromhell", "mwxml", "newspaper3k", "openai", "openai", "openapi-schema-pydantic", "pandas", "pdfminer-six", "pgvector", "psychicapi", "py-trello", "pymupdf", "pypdf", "pypdfium2", "pyspark", "rank-bm25", "rapidfuzz", "requests-toolbelt", "scikit-learn", "sqlite-vss", "streamlit", "sympy", "telethon", "tqdm", "xata", "xmltodict"] javascript = ["esprima"] llms = ["clarifai", "cohere", "huggingface_hub", "manifest-ml", "nlpcloud", "openai", "openlm", "torch", "transformers"] openai = ["openai", "tiktoken"] @@ -10482,4 +10491,4 @@ text-helpers = ["chardet"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "b42a2fe9986973ebfe9804276429dff258c1124ef2df5985fa56ecd0cde7e7e1" +content-hash = "b63078268a80c07577b432114302f4f86d47be25b83a245affb0dbc999fb2c1f" diff --git a/libs/langchain/pyproject.toml b/libs/langchain/pyproject.toml index e9df12a991..ac8f5c45df 100644 --- a/libs/langchain/pyproject.toml +++ 
b/libs/langchain/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langchain" -version = "0.0.282" +version = "0.0.286" description = "Building applications with LLMs through composability" authors = [] license = "MIT" diff --git a/libs/langchain/tests/integration_tests/chains/test_retrieval_qa_with_sources.py b/libs/langchain/tests/integration_tests/chains/test_retrieval_qa_with_sources.py new file mode 100644 index 0000000000..70ee98513e --- /dev/null +++ b/libs/langchain/tests/integration_tests/chains/test_retrieval_qa_with_sources.py @@ -0,0 +1,30 @@ +"""Test RetrievalQAWithSourcesChain functionality.""" +from pathlib import Path + +from langchain.chains import RetrievalQAWithSourcesChain +from langchain.chains.loading import load_chain +from langchain.document_loaders import DirectoryLoader +from langchain.embeddings.openai import OpenAIEmbeddings +from langchain.llms import OpenAI +from langchain.text_splitter import CharacterTextSplitter +from langchain.vectorstores import FAISS + + +def test_retrieval_qa_with_sources_chain_saving_loading(tmp_path: Path) -> None: + """Test saving and loading.""" + loader = DirectoryLoader("docs/extras/modules/", glob="*.txt") + documents = loader.load() + text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0) + texts = text_splitter.split_documents(documents) + embeddings = OpenAIEmbeddings() + docsearch = FAISS.from_documents(texts, embeddings) + qa = RetrievalQAWithSourcesChain.from_llm( + llm=OpenAI(), retriever=docsearch.as_retriever() + ) + qa("What did the president say about Ketanji Brown Jackson?") + + file_path = str(tmp_path / "RetrievalQAWithSourcesChain.yaml") + qa.save(file_path=file_path) + qa_loaded = load_chain(file_path, retriever=docsearch.as_retriever()) + + assert qa_loaded == qa diff --git a/libs/langchain/tests/integration_tests/chat_models/test_konko.py b/libs/langchain/tests/integration_tests/chat_models/test_konko.py new file mode 100644 index 0000000000..c47bbbb3f0 --- /dev/null +++ b/libs/langchain/tests/integration_tests/chat_models/test_konko.py @@ -0,0 +1,178 @@ +"""Evaluate ChatKonko Interface.""" +from typing import Any + +import pytest + +from langchain.callbacks.manager import CallbackManager +from langchain.chat_models.konko import ChatKonko +from langchain.schema import ( + ChatGeneration, + ChatResult, + LLMResult, +) +from langchain.schema.messages import BaseMessage, HumanMessage, SystemMessage +from tests.unit_tests.callbacks.fake_callback_handler import FakeCallbackHandler + + +def test_konko_chat_test() -> None: + """Evaluate basic ChatKonko functionality.""" + chat_instance = ChatKonko(max_tokens=10) + msg = HumanMessage(content="Hi") + chat_response = chat_instance([msg]) + assert isinstance(chat_response, BaseMessage) + assert isinstance(chat_response.content, str) + + +def test_konko_chat_test_openai() -> None: + """Evaluate ChatKonko functionality with an OpenAI model.""" + chat_instance = ChatKonko(max_tokens=10, model="gpt-3.5-turbo") + msg = HumanMessage(content="Hi") + chat_response = chat_instance([msg]) + assert isinstance(chat_response, BaseMessage) + assert isinstance(chat_response.content, str) + + +def test_konko_model_test() -> None: + """Check how ChatKonko manages model_name.""" + chat_instance = ChatKonko(model="alpha") + assert chat_instance.model == "alpha" + chat_instance = ChatKonko(model="beta") + assert chat_instance.model == "beta" + + +def test_konko_available_model_test() -> None: + """Check that ChatKonko lists its available models.""" + chat_instance = ChatKonko(max_tokens=10, n=2) + res = chat_instance.get_available_models() + 
assert isinstance(res, set) + + +def test_konko_system_msg_test() -> None: + """Evaluate ChatKonko's handling of system messages.""" + chat_instance = ChatKonko(max_tokens=10) + sys_msg = SystemMessage(content="Initiate user chat.") + user_msg = HumanMessage(content="Hi there") + chat_response = chat_instance([sys_msg, user_msg]) + assert isinstance(chat_response, BaseMessage) + assert isinstance(chat_response.content, str) + + +def test_konko_generation_test() -> None: + """Check ChatKonko's generation ability.""" + chat_instance = ChatKonko(max_tokens=10, n=2) + msg = HumanMessage(content="Hi") + gen_response = chat_instance.generate([[msg], [msg]]) + assert isinstance(gen_response, LLMResult) + assert len(gen_response.generations) == 2 + for gen_list in gen_response.generations: + assert len(gen_list) == 2 + for gen in gen_list: + assert isinstance(gen, ChatGeneration) + assert isinstance(gen.text, str) + assert gen.text == gen.message.content + + +def test_konko_multiple_outputs_test() -> None: + """Test multiple completions with ChatKonko.""" + chat_instance = ChatKonko(max_tokens=10, n=5) + msg = HumanMessage(content="Hi") + gen_response = chat_instance._generate([msg]) + assert isinstance(gen_response, ChatResult) + assert len(gen_response.generations) == 5 + for gen in gen_response.generations: + assert isinstance(gen.message, BaseMessage) + assert isinstance(gen.message.content, str) + + +def test_konko_streaming_callback_test() -> None: + """Evaluate streaming's token callback functionality.""" + callback_instance = FakeCallbackHandler() + callback_mgr = CallbackManager([callback_instance]) + chat_instance = ChatKonko( + max_tokens=10, + streaming=True, + temperature=0, + callback_manager=callback_mgr, + verbose=True, + ) + msg = HumanMessage(content="Hi") + chat_response = chat_instance([msg]) + assert callback_instance.llm_streams > 0 + assert isinstance(chat_response, BaseMessage) + + +def test_konko_streaming_info_test() -> None: + """Ensure generation details are retained during streaming.""" + + class TestCallback(FakeCallbackHandler): + data_store: dict = {} + + def on_llm_end(self, *args: Any, **kwargs: Any) -> Any: + self.data_store["generation"] = args[0] + + callback_instance = TestCallback() + callback_mgr = CallbackManager([callback_instance]) + chat_instance = ChatKonko( + max_tokens=2, + temperature=0, + callback_manager=callback_mgr, + ) + list(chat_instance.stream("hey")) + gen_data = callback_instance.data_store["generation"] + assert gen_data.generations[0][0].text == " Hey" + + +def test_konko_llm_model_name_test() -> None: + """Check if llm_output has model info.""" + chat_instance = ChatKonko(max_tokens=10) + msg = HumanMessage(content="Hi") + llm_data = chat_instance.generate([[msg]]) + assert llm_data.llm_output is not None + assert llm_data.llm_output["model_name"] == chat_instance.model + + +def test_konko_streaming_model_name_test() -> None: + """Check model info during streaming.""" + chat_instance = ChatKonko(max_tokens=10, streaming=True) + msg = HumanMessage(content="Hi") + llm_data = chat_instance.generate([[msg]]) + assert llm_data.llm_output is not None + assert llm_data.llm_output["model_name"] == chat_instance.model + + +def test_konko_streaming_param_validation_test() -> None: + """Ensure ChatKonko rejects streaming combined with n > 1.""" + with pytest.raises(ValueError): + ChatKonko( + max_tokens=10, + streaming=True, + temperature=0, + n=5, + ) + + +def test_konko_additional_args_test() -> None: + """Evaluate extra arguments for ChatKonko.""" + 
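# ChatKonko is expected, per the asserts below, to fold unrecognized constructor kwargs into model_kwargs and to raise ValueError for keys that are duplicated or reserved. + 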
chat_instance = ChatKonko(extra=3, max_tokens=10) + assert chat_instance.max_tokens == 10 + assert chat_instance.model_kwargs == {"extra": 3} + + chat_instance = ChatKonko(extra=3, model_kwargs={"addition": 2}) + assert chat_instance.model_kwargs == {"extra": 3, "addition": 2} + + with pytest.raises(ValueError): + ChatKonko(extra=3, model_kwargs={"extra": 2}) + + with pytest.raises(ValueError): + ChatKonko(model_kwargs={"temperature": 0.2}) + + with pytest.raises(ValueError): + ChatKonko(model_kwargs={"model": "text-davinci-003"}) + + +def test_konko_token_streaming_test() -> None: + """Check token streaming for ChatKonko.""" + chat_instance = ChatKonko(max_tokens=10) + + for token in chat_instance.stream("Just a test"): + assert isinstance(token.content, str) diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py index 03f5070120..2858b41e8e 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_polars_dataframe.py @@ -34,8 +34,6 @@ def test_load_returns_list_of_documents(sample_data_frame: pl.DataFrame) -> None def test_load_converts_dataframe_columns_to_document_metadata( sample_data_frame: pl.DataFrame, ) -> None: - import polars as pl - loader = PolarsDataFrameLoader(sample_data_frame) docs = loader.load() diff --git a/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py b/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py index 7bea1c6dee..eb53682d75 100644 --- a/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py +++ b/libs/langchain/tests/integration_tests/document_loaders/test_url_playwright.py @@ -7,7 +7,9 @@ from langchain.document_loaders import PlaywrightURLLoader from langchain.document_loaders.url_playwright import PlaywrightEvaluator if TYPE_CHECKING: - from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse + from playwright.async_api import Browser as AsyncBrowser + from playwright.async_api import Page as AsyncPage + from playwright.async_api import Response as AsyncResponse from playwright.sync_api import Browser, Page, Response diff --git a/libs/langchain/tests/integration_tests/llms/test_confident.py b/libs/langchain/tests/integration_tests/llms/test_confident.py new file mode 100644 index 0000000000..069f221f6e --- /dev/null +++ b/libs/langchain/tests/integration_tests/llms/test_confident.py @@ -0,0 +1,26 @@ +"""Test Confident.""" + + +def test_confident_deepeval() -> None: + """Test a valid call through the DeepEval callback handler.""" + from deepeval.metrics.answer_relevancy import AnswerRelevancy + + from langchain.callbacks.confident_callback import DeepEvalCallbackHandler + from langchain.llms import OpenAI + + answer_relevancy = AnswerRelevancy(minimum_score=0.3) + deepeval_callback = DeepEvalCallbackHandler( + implementation_name="exampleImplementation", metrics=[answer_relevancy] + ) + llm = OpenAI( + temperature=0, + callbacks=[deepeval_callback], + verbose=True, + openai_api_key="<YOUR_API_KEY>", + ) + llm.generate( + [ + "What is the best evaluation tool out there? 
(no bias at all)", + ] + ) + assert answer_relevancy.is_successful(), "Answer not relevant" diff --git a/libs/langchain/tests/integration_tests/smith/evaluation/test_runner_utils.py b/libs/langchain/tests/integration_tests/smith/evaluation/test_runner_utils.py index 9696515a9c..4db2e88c97 100644 --- a/libs/langchain/tests/integration_tests/smith/evaluation/test_runner_utils.py +++ b/libs/langchain/tests/integration_tests/smith/evaluation/test_runner_utils.py @@ -5,7 +5,6 @@ import pytest from langsmith import Client as Client from langsmith.schemas import DataType -from langchain.callbacks.tracers.evaluation import wait_for_all_evaluators from langchain.chains.llm import LLMChain from langchain.chat_models import ChatOpenAI from langchain.evaluation import EvaluatorType @@ -22,7 +21,6 @@ def _check_all_feedback_passed(_project_name: str, client: Client) -> None: # chain or llm passes for the feedback provided. runs = list(client.list_runs(project_name=_project_name, execution_order=1)) assert len(runs) == 4 - wait_for_all_evaluators() feedback = list(client.list_feedback(run_ids=[run.id for run in runs])) assert len(feedback) == 8 assert all([f.score == 1 for f in feedback]) diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_nucliadb.py b/libs/langchain/tests/integration_tests/vectorstores/test_nucliadb.py new file mode 100644 index 0000000000..1cfeea0da4 --- /dev/null +++ b/libs/langchain/tests/integration_tests/vectorstores/test_nucliadb.py @@ -0,0 +1,98 @@ +from typing import Any +from unittest import mock + +from langchain.vectorstores.nucliadb import NucliaDB + + +class attrdict(dict): + def __getitem__(self, key: str) -> Any: + value = dict.__getitem__(self, key) + return attrdict(value) if isinstance(value, dict) else value + + __getattr__ = __getitem__ + + +def FakeCreate(**args: Any) -> Any: + def fn(self: Any, **kwargs: Any) -> str: + return "fake_uuid" + + return fn + + +def FakeDelete(**args: Any) -> Any: + def fn(self: Any, **kwargs: Any) -> None: + return None + + return fn + + +def FakeFind(**args: Any) -> Any: + def fn(self: Any, **kwargs: Any) -> Any: + return attrdict( + { + "resources": { + "123": attrdict( + { + "fields": { + "456": attrdict( + { + "paragraphs": { + "123/t/text/0-14": attrdict( + { + "text": "This is a test", + "order": 0, + } + ), + } + } + ) + }, + "data": { + "texts": { + "text": { + "body": "This is a test", + } + } + }, + "extra": attrdict({"metadata": {"some": "metadata"}}), + } + ) + } + } + ) + + return fn + + +def test_add_texts() -> None: + with mock.patch( + "nuclia.sdk.resource.NucliaResource.create", + new_callable=FakeCreate, + ): + ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY") + assert ndb.is_local is False + ids = ndb.add_texts(["This is a new test", "This is a second test"]) + assert len(ids) == 2 + + +def test_delete() -> None: + with mock.patch( + "nuclia.sdk.resource.NucliaResource.delete", + new_callable=FakeDelete, + ): + ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY") + success = ndb.delete(["123", "456"]) + assert success + + +def test_search() -> None: + with mock.patch( + "nuclia.sdk.search.NucliaSearch.find", + new_callable=FakeFind, + ): + ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY") + results = ndb.similarity_search("Who was inspired by Ada Lovelace?") + assert len(results) == 1 + assert results[0].page_content == "This is a test" + assert results[0].metadata["extra"]["some"] == "metadata" + assert 
results[0].metadata["value"]["body"] == "This is a test" diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py b/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py index 6d6028497c..b0dc5b27b7 100644 --- a/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py +++ b/libs/langchain/tests/integration_tests/vectorstores/test_pgvector.py @@ -279,3 +279,31 @@ def test_pgvector_retriever_search_threshold_custom_normalization_fn() -> None: ) output = retriever.get_relevant_documents("foo") assert output == [] + + +def test_pgvector_max_marginal_relevance_search() -> None: + """Test max marginal relevance search.""" + texts = ["foo", "bar", "baz"] + docsearch = PGVector.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.max_marginal_relevance_search("foo", k=1, fetch_k=3) + assert output == [Document(page_content="foo")] + + +def test_pgvector_max_marginal_relevance_search_with_score() -> None: + """Test max marginal relevance search with relevance scores.""" + texts = ["foo", "bar", "baz"] + docsearch = PGVector.from_texts( + texts=texts, + collection_name="test_collection", + embedding=FakeEmbeddingsWithAdaDimension(), + connection_string=CONNECTION_STRING, + pre_delete_collection=True, + ) + output = docsearch.max_marginal_relevance_search_with_score("foo", k=1, fetch_k=3) + assert output == [(Document(page_content="foo"), 0.0)] diff --git a/libs/langchain/tests/integration_tests/vectorstores/test_redis.py b/libs/langchain/tests/integration_tests/vectorstores/test_redis.py index 3b7a4c7acc..6128a8445a 100644 --- a/libs/langchain/tests/integration_tests/vectorstores/test_redis.py +++ b/libs/langchain/tests/integration_tests/vectorstores/test_redis.py @@ -136,6 +136,32 @@ def test_redis_from_documents(texts: List[str]) -> None: assert drop(docsearch.index_name) +def test_custom_keys(texts: List[str]) -> None: + keys_in = ["test_key_1", "test_key_2", "test_key_3"] + docsearch, keys_out = Redis.from_texts_return_keys( + texts, FakeEmbeddings(), redis_url=TEST_REDIS_URL, keys=keys_in + ) + assert keys_in == keys_out + assert drop(docsearch.index_name) + + +def test_custom_keys_from_docs(texts: List[str]) -> None: + keys_in = ["test_key_1", "test_key_2", "test_key_3"] + docs = [Document(page_content=t, metadata={"a": "b"}) for t in texts] + + docsearch = Redis.from_documents( + docs, FakeEmbeddings(), redis_url=TEST_REDIS_URL, keys=keys_in + ) + client = docsearch.client + # test keys are correct + assert client.hget("test_key_1", "content") + # test metadata is stored + assert client.hget("test_key_1", "a") == bytes("b", "utf-8") + # test all keys are stored + assert client.hget("test_key_2", "content") + assert drop(docsearch.index_name) + + # -- test filters -- # @@ -187,12 +213,21 @@ def test_redis_filters_1( documents, FakeEmbeddings(), redis_url=TEST_REDIS_URL ) - output = docsearch.similarity_search("foo", k=3, filter=filter_expr) + sim_output = docsearch.similarity_search("foo", k=3, filter=filter_expr) + mmr_output = docsearch.max_marginal_relevance_search( + "foo", k=3, fetch_k=5, filter=filter_expr + ) - assert len(output) == expected_length + assert len(sim_output) == expected_length + assert len(mmr_output) == expected_length if expected_nums is not None: - for out in output: + for out in sim_output: + assert ( + out.metadata["text"] in expected_nums + or 
int(out.metadata["num"]) in expected_nums + ) + for out in mmr_output: assert ( out.metadata["text"] in expected_nums or int(out.metadata["num"]) in expected_nums @@ -302,7 +337,6 @@ def test_similarity_search_limit_distance(texts: List[str]) -> None: def test_similarity_search_with_score_with_limit_distance(texts: List[str]) -> None: """Test similarity search with score with limit score.""" - docsearch = Redis.from_texts( texts, ConsistentFakeEmbeddings(), redis_url=TEST_REDIS_URL ) @@ -317,6 +351,32 @@ def test_similarity_search_with_score_with_limit_distance(texts: List[str]) -> None: assert drop(docsearch.index_name) + + +def test_max_marginal_relevance_search(texts: List[str]) -> None: + """Test max marginal relevance search.""" + docsearch = Redis.from_texts(texts, FakeEmbeddings(), redis_url=TEST_REDIS_URL) + + mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=3, fetch_k=3) + sim_output = docsearch.similarity_search(texts[0], k=3) + assert mmr_output == sim_output + + mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=2, fetch_k=3) + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == texts[0] + assert mmr_output[1].page_content == texts[1] + + mmr_output = docsearch.max_marginal_relevance_search( + texts[0], k=2, fetch_k=3, lambda_mult=0.1 # more diversity + ) + assert len(mmr_output) == 2 + assert mmr_output[0].page_content == texts[0] + assert mmr_output[1].page_content == texts[2] + + # if fetch_k < k, then the output will be less than k + mmr_output = docsearch.max_marginal_relevance_search(texts[0], k=3, fetch_k=2) + assert len(mmr_output) == 2 + assert drop(docsearch.index_name) + + def test_delete(texts: List[str]) -> None: """Test deleting a new document""" docsearch = Redis.from_texts(texts, FakeEmbeddings(), redis_url=TEST_REDIS_URL) diff --git a/libs/langchain/tests/unit_tests/retrievers/self_query/test_redis.py b/libs/langchain/tests/unit_tests/retrievers/self_query/test_redis.py new file mode 100644 index 0000000000..62c225fe02 --- /dev/null +++ b/libs/langchain/tests/unit_tests/retrievers/self_query/test_redis.py @@ -0,0 +1,120 @@ +from typing import Dict, Tuple + +import pytest + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, +) +from langchain.retrievers.self_query.redis import RedisTranslator +from langchain.vectorstores.redis.filters import ( + RedisFilterExpression, + RedisNum, + RedisTag, + RedisText, +) +from langchain.vectorstores.redis.schema import ( + NumericFieldSchema, + RedisModel, + TagFieldSchema, + TextFieldSchema, +) + + +@pytest.fixture +def translator() -> RedisTranslator: + schema = RedisModel( + text=[TextFieldSchema(name="bar")], + numeric=[NumericFieldSchema(name="foo")], + tag=[TagFieldSchema(name="tag")], + ) + return RedisTranslator(schema) + + +@pytest.mark.parametrize( + ("comp", "expected"), + [ + ( + Comparison(comparator=Comparator.LT, attribute="foo", value=1), + RedisNum("foo") < 1, + ), + ( + Comparison(comparator=Comparator.LIKE, attribute="bar", value="baz*"), + RedisText("bar") % "baz*", + ), + ( + Comparison( + comparator=Comparator.CONTAIN, attribute="tag", value=["blue", "green"] + ), + RedisTag("tag") == ["blue", "green"], + ), + ], +) +def test_visit_comparison( + translator: RedisTranslator, comp: Comparison, expected: RedisFilterExpression +) -> None: + actual = translator.visit_comparison(comp) + assert 
str(expected) == str(actual) + + +def test_visit_operation(translator: RedisTranslator) -> None: + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.EQ, attribute="tag", value="high"), + ], + ) + expected = (RedisNum("foo") < 2) & ( + (RedisText("bar") == "baz") & (RedisTag("tag") == "high") + ) + actual = translator.visit_operation(op) + assert str(expected) == str(actual) + + +def test_visit_structured_query_no_filter(translator: RedisTranslator) -> None: + query = "What is the capital of France?" + + structured_query = StructuredQuery( + query=query, + filter=None, + ) + expected: Tuple[str, Dict] = (query, {}) + actual = translator.visit_structured_query(structured_query) + assert expected == actual + + +def test_visit_structured_query_comparison(translator: RedisTranslator) -> None: + query = "What is the capital of France?" + comp = Comparison(comparator=Comparator.GTE, attribute="foo", value=2) + structured_query = StructuredQuery( + query=query, + filter=comp, + ) + expected_filter = RedisNum("foo") >= 2 + actual_query, actual_filter = translator.visit_structured_query(structured_query) + assert actual_query == query + assert str(actual_filter["filter"]) == str(expected_filter) + + +def test_visit_structured_query_operation(translator: RedisTranslator) -> None: + query = "What is the capital of France?" + op = Operation( + operator=Operator.OR, + arguments=[ + Comparison(comparator=Comparator.EQ, attribute="foo", value=2), + Comparison(comparator=Comparator.CONTAIN, attribute="bar", value="baz"), + ], + ) + structured_query = StructuredQuery( + query=query, + filter=op, + ) + expected_filter = (RedisNum("foo") == 2) | (RedisText("bar") == "baz") + actual_query, actual_filter = translator.visit_structured_query(structured_query) + assert actual_query == query + assert str(actual_filter["filter"]) == str(expected_filter) diff --git a/libs/langchain/tests/unit_tests/retrievers/self_query/test_supabase.py b/libs/langchain/tests/unit_tests/retrievers/self_query/test_supabase.py new file mode 100644 index 0000000000..de9b04fabf --- /dev/null +++ b/libs/langchain/tests/unit_tests/retrievers/self_query/test_supabase.py @@ -0,0 +1,85 @@ +from typing import Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, +) +from langchain.retrievers.self_query.supabase import SupabaseVectorTranslator + +DEFAULT_TRANSLATOR = SupabaseVectorTranslator() + + +def test_visit_comparison() -> None: + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=["1", "2"]) + expected = "and(metadata->>foo.lt.1,metadata->>foo.lt.2)" + actual = DEFAULT_TRANSLATOR.visit_comparison(comp) + assert expected == actual + + +def test_visit_operation() -> None: + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=["1", "2"]), + ], + ) + expected = ( + "and(" + "metadata->foo.lt.2," + "metadata->>bar.eq.baz," + "and(metadata->>abc.lt.1,metadata->>abc.lt.2)" + ")" + ) + actual = DEFAULT_TRANSLATOR.visit_operation(op) + assert expected == actual + + +def test_visit_structured_query() -> None: + query = "What is the capital of France?" 
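+ # SupabaseVectorTranslator is expected to emit PostgREST filter strings; a list-valued comparison expands to an and(...) of one comparison per element, as the expected values below assume.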
+ structured_query = StructuredQuery( + query=query, + filter=None, + ) + expected: Tuple[str, Dict] = (query, {}) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=["1", "2"]) + expected = ( + query, + {"postgrest_filter": "and(metadata->>foo.lt.1,metadata->>foo.lt.2)"}, + ) + structured_query = StructuredQuery( + query=query, + filter=comp, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=["1", "2"]), + ], + ) + structured_query = StructuredQuery( + query=query, + filter=op, + ) + expected = ( + query, + { + "postgrest_filter": ( + "and(metadata->foo.lt.2,metadata->>bar.eq.baz,and(metadata->>abc.lt.1,metadata->>abc.lt.2))" + ) + }, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual diff --git a/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py b/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py new file mode 100644 index 0000000000..05c15f26ac --- /dev/null +++ b/libs/langchain/tests/unit_tests/retrievers/self_query/test_vectara.py @@ -0,0 +1,71 @@ +from typing import Dict, Tuple + +from langchain.chains.query_constructor.ir import ( + Comparator, + Comparison, + Operation, + Operator, + StructuredQuery, +) +from langchain.retrievers.self_query.vectara import VectaraTranslator + +DEFAULT_TRANSLATOR = VectaraTranslator() + + +def test_visit_comparison() -> None: + comp = Comparison(comparator=Comparator.LT, attribute="foo", value="1") + expected = "( doc.foo < '1' )" + actual = DEFAULT_TRANSLATOR.visit_comparison(comp) + assert expected == actual + + +def test_visit_operation() -> None: + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=1), + ], + ) + expected = "( ( doc.foo < 2 ) and ( doc.bar = 'baz' ) and ( doc.abc < 1 ) )" + actual = DEFAULT_TRANSLATOR.visit_operation(op) + assert expected == actual + + +def test_visit_structured_query() -> None: + query = "What is the capital of France?" 
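+ # VectaraTranslator is expected to render each comparison in Vectara's metadata-filter syntax, e.g. "( doc.foo < 1 )", joining clauses with "and"/"or".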
+ structured_query = StructuredQuery( + query=query, + filter=None, + limit=None, + ) + expected: Tuple[str, Dict] = (query, {}) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + comp = Comparison(comparator=Comparator.LT, attribute="foo", value=1) + expected = (query, {"filter": "( doc.foo < 1 )"}) + structured_query = StructuredQuery( + query=query, + filter=comp, + limit=None, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual + + op = Operation( + operator=Operator.AND, + arguments=[ + Comparison(comparator=Comparator.LT, attribute="foo", value=2), + Comparison(comparator=Comparator.EQ, attribute="bar", value="baz"), + Comparison(comparator=Comparator.LT, attribute="abc", value=1), + ], + ) + structured_query = StructuredQuery(query=query, filter=op, limit=None) + expected = ( + query, + {"filter": "( ( doc.foo < 2 ) and ( doc.bar = 'baz' ) and ( doc.abc < 1 ) )"}, + ) + actual = DEFAULT_TRANSLATOR.visit_structured_query(structured_query) + assert expected == actual diff --git a/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr index 5ea21b13d6..63c0acc38d 100644 --- a/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr +++ b/libs/langchain/tests/unit_tests/schema/runnable/__snapshots__/test_runnable.ambr @@ -467,7 +467,7 @@ # --- # name: test_combining_sequences.3 list([ - Run(id=UUID('00000000-0000-4000-8000-000000000000'), name='RunnableSequence', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'runnable', 'RunnableSequence'], 'kwargs': {'first': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, 'middle': [{'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo, bar'], sleep=None, i=0)"}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}, {'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 
'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nicer assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, {'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['baz, qux'], sleep=None, i=0)"}], 'last': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'output': ['baz', 'qux']}, reference_example_id=None, parent_run_id=None, tags=[], execution_order=None, child_execution_order=None, child_runs=[Run(id=UUID('00000000-0000-4000-8000-000000000001'), name='ChatPromptTemplate', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='prompt', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptValue'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'SystemMessage'], 'kwargs': {'content': 'You are a nice assistant.', 'additional_kwargs': {}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is your name?', 'additional_kwargs': {}}}]}}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:1'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000002'), name='FakeListChatModel', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='llm', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={'invocation_params': {'responses': ['foo, bar'], '_type': 'fake-list-chat-model', 'stop': None}, 'options': {'stop': None}}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 
'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo, bar'], sleep=None, i=0)"}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'prompts': ['System: You are a nice assistant.\nHuman: What is your name?']}, outputs={'generations': [[{'text': 'foo, bar', 'generation_info': None, 'message': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': 'foo, bar'}}}]], 'llm_output': None, 'run': None}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:2'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000003'), name='CommaSeparatedListOutputParser', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='parser', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': AIMessage(content='foo, bar', additional_kwargs={}, example=False)}, outputs={'output': ['foo', 'bar']}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:3'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000004'), name='RunnableLambda', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': ['foo', 'bar']}, outputs={'question': 'foobar'}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:4'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000005'), name='ChatPromptTemplate', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='prompt', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nicer assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], 
inputs={'question': 'foobar'}, outputs={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptValue'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'SystemMessage'], 'kwargs': {'content': 'You are a nicer assistant.', 'additional_kwargs': {}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'foobar', 'additional_kwargs': {}}}]}}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:5'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000006'), name='FakeListChatModel', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='llm', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={'invocation_params': {'responses': ['baz, qux'], '_type': 'fake-list-chat-model', 'stop': None}, 'options': {'stop': None}}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['baz, qux'], sleep=None, i=0)"}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'prompts': ['System: You are a nicer assistant.\nHuman: foobar']}, outputs={'generations': [[{'text': 'baz, qux', 'generation_info': None, 'message': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': 'baz, qux'}}}]], 'llm_output': None, 'run': None}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:6'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000007'), name='CommaSeparatedListOutputParser', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='parser', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': AIMessage(content='baz, qux', additional_kwargs={}, example=False)}, outputs={'output': ['baz', 'qux']}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:7'], execution_order=None, child_execution_order=None, child_runs=[])]), + Run(id=UUID('00000000-0000-4000-8000-000000000000'), name='RunnableSequence', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'runnable', 'RunnableSequence'], 'kwargs': {'first': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 
'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, 'middle': [{'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo, bar'], sleep=None, i=0)"}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}, {'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nicer assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, {'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['baz, qux'], sleep=None, i=0)"}], 'last': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'output': ['baz', 'qux']}, reference_example_id=None, parent_run_id=None, tags=[], execution_order=None, child_execution_order=None, child_runs=[Run(id=UUID('00000000-0000-4000-8000-000000000001'), name='ChatPromptTemplate', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='prompt', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, events=[{'name': 'start', 'time': 
FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptValue'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'SystemMessage'], 'kwargs': {'content': 'You are a nice assistant.', 'additional_kwargs': {}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is your name?', 'additional_kwargs': {}}}]}}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:1'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000002'), name='FakeListChatModel', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='llm', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={'invocation_params': {'responses': ['foo, bar'], '_type': 'fake-list-chat-model', 'stop': None}, 'options': {'stop': None}}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo, bar'], sleep=None, i=0)"}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'prompts': ['System: You are a nice assistant.\nHuman: What is your name?']}, outputs={'generations': [[{'text': 'foo, bar', 'generation_info': None, 'message': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': 'foo, bar'}}}]], 'llm_output': None, 'run': None}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:2'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000003'), name='CommaSeparatedListOutputParser', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='parser', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': AIMessage(content='foo, bar', additional_kwargs={}, example=False)}, outputs={'output': ['foo', 'bar']}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:3'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000004'), name='<lambda>', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': ['foo', 'bar']}, outputs={'question': 'foobar'}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:4'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000005'), 
name='ChatPromptTemplate', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='prompt', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nicer assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'foobar'}, outputs={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptValue'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'SystemMessage'], 'kwargs': {'content': 'You are a nicer assistant.', 'additional_kwargs': {}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'foobar', 'additional_kwargs': {}}}]}}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:5'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000006'), name='FakeListChatModel', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='llm', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={'invocation_params': {'responses': ['baz, qux'], '_type': 'fake-list-chat-model', 'stop': None}, 'options': {'stop': None}}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'chat_models', 'fake', 'FakeListChatModel'], 'repr': "FakeListChatModel(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['baz, qux'], sleep=None, i=0)"}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'prompts': ['System: You are a nicer assistant.\nHuman: foobar']}, outputs={'generations': [[{'text': 'baz, qux', 'generation_info': None, 'message': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'AIMessage'], 'kwargs': {'content': 'baz, qux'}}}]], 'llm_output': None, 'run': None}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:6'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000007'), name='CommaSeparatedListOutputParser', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='parser', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'output_parsers', 'list', 'CommaSeparatedListOutputParser'], 'kwargs': {}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': AIMessage(content='baz, qux', 
additional_kwargs={}, example=False)}, outputs={'output': ['baz', 'qux']}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:7'], execution_order=None, child_execution_order=None, child_runs=[])]), ]) # --- # name: test_each @@ -1407,7 +1407,7 @@ # --- # name: test_prompt_with_llm_and_async_lambda.1 list([ - Run(id=UUID('00000000-0000-4000-8000-000000000000'), name='RunnableSequence', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'runnable', 'RunnableSequence'], 'kwargs': {'first': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, 'middle': [{'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'llms', 'fake', 'FakeListLLM'], 'repr': "FakeListLLM(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo', 'bar'], sleep=None, i=0)"}], 'last': {'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'output': 'foo'}, reference_example_id=None, parent_run_id=None, tags=[], execution_order=None, child_execution_order=None, child_runs=[Run(id=UUID('00000000-0000-4000-8000-000000000001'), name='ChatPromptTemplate', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='prompt', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'lc': 1, 'type': 'constructor', 'id': 
['langchain', 'prompts', 'chat', 'ChatPromptValue'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'SystemMessage'], 'kwargs': {'content': 'You are a nice assistant.', 'additional_kwargs': {}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is your name?', 'additional_kwargs': {}}}]}}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:1'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000002'), name='FakeListLLM', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='llm', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={'invocation_params': {'responses': ['foo', 'bar'], '_type': 'fake-list', 'stop': None}, 'options': {'stop': None}}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'llms', 'fake', 'FakeListLLM'], 'repr': "FakeListLLM(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo', 'bar'], sleep=None, i=0)"}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'prompts': ['System: You are a nice assistant.\nHuman: What is your name?']}, outputs={'generations': [[{'text': 'foo', 'generation_info': None}]], 'llm_output': None, 'run': None}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:2'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000003'), name='RunnableLambda', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': 'foo'}, outputs={'output': 'foo'}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:3'], execution_order=None, child_execution_order=None, child_runs=[])]), + Run(id=UUID('00000000-0000-4000-8000-000000000000'), name='RunnableSequence', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'runnable', 'RunnableSequence'], 'kwargs': {'first': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, 'middle': [{'lc': 1, 
'type': 'not_implemented', 'id': ['langchain', 'llms', 'fake', 'FakeListLLM'], 'repr': "FakeListLLM(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo', 'bar'], sleep=None, i=0)"}], 'last': {'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'output': 'foo'}, reference_example_id=None, parent_run_id=None, tags=[], execution_order=None, child_execution_order=None, child_runs=[Run(id=UUID('00000000-0000-4000-8000-000000000001'), name='ChatPromptTemplate', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='prompt', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptTemplate'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'SystemMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': [], 'template': 'You are a nice assistant.', 'template_format': 'f-string', 'partial_variables': {}}}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'HumanMessagePromptTemplate'], 'kwargs': {'prompt': {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'prompt', 'PromptTemplate'], 'kwargs': {'input_variables': ['question'], 'template': '{question}', 'template_format': 'f-string', 'partial_variables': {}}}}}], 'input_variables': ['question']}}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'question': 'What is your name?'}, outputs={'lc': 1, 'type': 'constructor', 'id': ['langchain', 'prompts', 'chat', 'ChatPromptValue'], 'kwargs': {'messages': [{'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'SystemMessage'], 'kwargs': {'content': 'You are a nice assistant.', 'additional_kwargs': {}}}, {'lc': 1, 'type': 'constructor', 'id': ['langchain', 'schema', 'messages', 'HumanMessage'], 'kwargs': {'content': 'What is your name?', 'additional_kwargs': {}}}]}}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:1'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000002'), name='FakeListLLM', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='llm', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={'invocation_params': {'responses': ['foo', 'bar'], '_type': 'fake-list', 'stop': None}, 'options': {'stop': None}}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'llms', 'fake', 'FakeListLLM'], 'repr': "FakeListLLM(cache=None, verbose=False, callbacks=None, callback_manager=None, tags=None, metadata=None, responses=['foo', 'bar'], sleep=None, i=0)"}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'prompts': ['System: You are a nice assistant.\nHuman: What is your name?']}, outputs={'generations': [[{'text': 'foo', 'generation_info': None}]], 'llm_output': None, 'run': None}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), 
tags=['seq:step:2'], execution_order=None, child_execution_order=None, child_runs=[]), Run(id=UUID('00000000-0000-4000-8000-000000000003'), name='passthrough', start_time=FakeDatetime(2023, 1, 1, 0, 0), run_type='chain', end_time=FakeDatetime(2023, 1, 1, 0, 0), extra={}, error=None, serialized={'lc': 1, 'type': 'not_implemented', 'id': ['langchain', 'schema', 'runnable', 'base', 'RunnableLambda'], 'repr': 'RunnableLambda(...)'}, events=[{'name': 'start', 'time': FakeDatetime(2023, 1, 1, 0, 0)}, {'name': 'end', 'time': FakeDatetime(2023, 1, 1, 0, 0)}], inputs={'input': 'foo'}, outputs={'output': 'foo'}, reference_example_id=None, parent_run_id=UUID('00000000-0000-4000-8000-000000000000'), tags=['seq:step:3'], execution_order=None, child_execution_order=None, child_runs=[])]), ]) # --- # name: test_router_runnable diff --git a/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py b/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py index 2e0be35ddc..98bf284fd0 100644 --- a/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py +++ b/libs/langchain/tests/unit_tests/schema/runnable/test_runnable.py @@ -948,7 +948,7 @@ async def test_higher_order_lambda_runnable( parent_run = next(r for r in tracer.runs if r.parent_run_id is None) assert len(parent_run.child_runs) == 2 router_run = parent_run.child_runs[1] - assert router_run.name == "RunnableLambda" + assert router_run.name == "router" assert len(router_run.child_runs) == 1 math_run = router_run.child_runs[0] assert math_run.name == "RunnableSequence" @@ -980,7 +980,7 @@ async def test_higher_order_lambda_runnable( parent_run = next(r for r in tracer.runs if r.parent_run_id is None) assert len(parent_run.child_runs) == 2 router_run = parent_run.child_runs[1] - assert router_run.name == "RunnableLambda" + assert router_run.name == "arouter" assert len(router_run.child_runs) == 1 math_run = router_run.child_runs[0] assert math_run.name == "RunnableSequence" @@ -1507,7 +1507,7 @@ async def test_async_retrying(mocker: MockerFixture) -> None: with pytest.raises(ValueError): await runnable.with_retry( stop_after_attempt=2, - retry_if_exception_type=(ValueError,), + retry_if_exception_type=(ValueError, KeyError), ).ainvoke(1) assert _lambda_mock.call_count == 2 # retried diff --git a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py index 914958031d..825c61e74e 100644 --- a/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py +++ b/libs/langchain/tests/unit_tests/smith/evaluation/test_runner_utils.py @@ -181,11 +181,15 @@ def test_run_llm_or_chain_with_input_mapper() -> None: assert "the wrong input" in inputs return {"the right input": inputs["the wrong input"]} - result = _run_llm_or_chain(example, lambda: mock_chain, input_mapper=input_mapper) + result = _run_llm_or_chain( + example, + {"callbacks": [], "tags": []}, + llm_or_chain_factory=lambda: mock_chain, + input_mapper=input_mapper, + ) assert result == {"output": "2", "the right input": "1"} bad_result = _run_llm_or_chain( - example, - lambda: mock_chain, + example, {"callbacks": [], "tags": []}, llm_or_chain_factory=lambda: mock_chain ) assert "Error" in bad_result @@ -195,7 +199,12 @@ def test_run_llm_or_chain_with_input_mapper() -> None: return "the right input" mock_llm = FakeLLM(queries={"the right input": "somenumber"}) - llm_result = _run_llm_or_chain(example, mock_llm, input_mapper=llm_input_mapper) + llm_result = _run_llm_or_chain( + example, 
+ {"callbacks": [], "tags": []}, + llm_or_chain_factory=mock_llm, + input_mapper=llm_input_mapper, + ) assert isinstance(llm_result, str) assert llm_result == "somenumber" @@ -324,10 +333,14 @@ async def test_arun_on_dataset(monkeypatch: pytest.MonkeyPatch) -> None: ) expected = { - uuid_: { - "output": {"result": f"Result for example {uuid.UUID(uuid_)}"}, + str(example.id): { + "output": { + "result": f"Result for example {uuid.UUID(str(example.id))}" + }, + "input": {"input": example.inputs["input"]}, + "reference": {"output": example.outputs["output"]}, "feedback": [], } - for uuid_ in uuids + for example in examples } assert results["results"] == expected diff --git a/libs/langchain/tests/unit_tests/test_text_splitter.py b/libs/langchain/tests/unit_tests/test_text_splitter.py index 39d07b0152..717b28c242 100644 --- a/libs/langchain/tests/unit_tests/test_text_splitter.py +++ b/libs/langchain/tests/unit_tests/test_text_splitter.py @@ -498,6 +498,73 @@ public class HelloWorld { ] +def test_csharp_code_splitter() -> None: + splitter = RecursiveCharacterTextSplitter.from_language( + Language.CSHARP, chunk_size=CHUNK_SIZE, chunk_overlap=0 + ) + code = """ +using System; +class Program +{ + static void Main() + { + int age = 30; // Change the age value as needed + + // Categorize the age without any console output + if (age < 18) + { + // Age is under 18 + } + else if (age >= 18 && age < 65) + { + // Age is an adult + } + else + { + // Age is a senior citizen + } + } +} + """ + + chunks = splitter.split_text(code) + assert chunks == [ + "using System;", + "class Program\n{", + "static void", + "Main()", + "{", + "int age", + "= 30; // Change", + "the age value", + "as needed", + "//", + "Categorize the", + "age without any", + "console output", + "if (age", + "< 18)", + "{", + "//", + "Age is under 18", + "}", + "else if", + "(age >= 18 &&", + "age < 65)", + "{", + "//", + "Age is an adult", + "}", + "else", + "{", + "//", + "Age is a senior", + "citizen", + "}\n }", + "}", + ] + + def test_cpp_code_splitter() -> None: splitter = RecursiveCharacterTextSplitter.from_language( Language.CPP, chunk_size=CHUNK_SIZE, chunk_overlap=0 diff --git a/tests/integration_tests/vectorstores/test_vearch.py b/tests/integration_tests/vectorstores/test_vearch.py new file mode 100644 index 0000000000..a6827b4b85 --- /dev/null +++ b/tests/integration_tests/vectorstores/test_vearch.py @@ -0,0 +1,97 @@ +from langchain.docstore.document import Document +from langchain.vectorstores.vearch import VearchDb +from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings + + +def test_vearch() -> None: + """ + Test end to end create vearch ,store vector into it and search + """ + texts = [ + "Vearch 是一款存储大语言模型数据的向量数据库,用于存储和快速搜索模型embedding后的向量,可用于基于个人知识库的大模型应用", + "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库", + "vearch 是基于C语言,go语言开发的,并提供python接口,可以直接通过pip安装", + ] + metadatas = [ + { + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + { + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + { + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + ] + vearch_db = VearchDb.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + table_name="test_vearch", + metadata_path="./", + ) + result = vearch_db.similarity_search( + "Vearch 支持OpenAI, Llama, ChatGLM等模型,以及LangChain库", 1 + ) + assert result == [ + Document( + page_content="Vearch 支持OpenAI, Llama, 
+ metadata={ + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + ) + ] + + +def test_vearch_add_texts() -> None: + """Test end-to-end adding of texts.""" + texts = [ + "Vearch is a vector database for storing large language model data; it stores and quickly searches model embedding vectors, and can be used for LLM applications based on a personal knowledge base", + "Vearch supports models such as OpenAI, Llama, and ChatGLM, as well as the LangChain library", + "vearch is developed in C and Go, provides a Python interface, and can be installed directly via pip", + ] + + metadatas = [ + { + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + { + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + { + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + ] + vearch_db = VearchDb.from_texts( + texts=texts, + embedding=FakeEmbeddings(), + metadatas=metadatas, + table_name="test_vearch", + metadata_path="./", + ) + + vearch_db.add_texts( + texts=["Vearch supports models such as OpenAI, Llama, and ChatGLM, as well as the LangChain library"], + metadatas=[ + { + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + } + ], + ) + result = vearch_db.similarity_search( + "Vearch supports models such as OpenAI, Llama, and ChatGLM, as well as the LangChain library", 2 + ) + + assert result == [ + Document( + page_content="Vearch supports models such as OpenAI, Llama, and ChatGLM, as well as the LangChain library", + metadata={ + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + ), + Document( + page_content="Vearch supports models such as OpenAI, Llama, and ChatGLM, as well as the LangChain library", + metadata={ + "source": "/data/zhx/zhx/langchain-ChatGLM_new/knowledge_base/santi/three_body.txt" + }, + ), + ]
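
For context, a minimal sketch of the store-and-search flow the new Vearch integration tests above exercise. It assumes only the VearchDb surface already shown in this diff (from_texts, add_texts, similarity_search, and the table_name/metadata_path keyword arguments); the table name and source paths below are hypothetical placeholders, not values from the change.

    # Sketch only: mirrors the API usage in test_vearch.py above.
    from langchain.docstore.document import Document
    from langchain.vectorstores.vearch import VearchDb
    from tests.integration_tests.vectorstores.fake_embeddings import FakeEmbeddings

    # Create a table from seed texts ("demo_vearch" and "example.txt" are placeholders).
    store = VearchDb.from_texts(
        texts=["Vearch supports models such as OpenAI, Llama, and ChatGLM"],
        embedding=FakeEmbeddings(),  # deterministic fake embeddings, as in the tests
        metadatas=[{"source": "example.txt"}],
        table_name="demo_vearch",
        metadata_path="./",
    )

    # add_texts takes parallel lists: one metadata dict per text.
    store.add_texts(
        texts=["vearch can be installed directly via pip"],
        metadatas=[{"source": "example.txt"}],
    )

    # The second positional argument is k; k=1 returns the single nearest Document.
    docs = store.similarity_search("Which models does Vearch support?", 1)
    assert isinstance(docs[0], Document)

This pairing is also why the metadatas argument to add_texts in test_vearch_add_texts is written as a list above: the base VectorStore.add_texts interface expects an optional list of dicts, one per input text, rather than a single dict.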