Merge branch 'master' into deepsense/text-to-speech

pull/10181/head
mateusz.wosinski
commit 868db99b17

@ -39,10 +39,35 @@ runs:
with:
path: |
/opt/pipx/venvs/poetry
/opt/pipx_bin/poetry
# This step caches the poetry installation, so make sure it's keyed on the poetry version as well.
key: bin-poetry-${{ runner.os }}-${{ runner.arch }}-py-${{ inputs.python-version }}-${{ inputs.poetry-version }}
- name: Refresh shell hashtable and fixup softlinks
if: steps.cache-bin-poetry.outputs.cache-hit == 'true'
shell: bash
env:
POETRY_VERSION: ${{ inputs.poetry-version }}
PYTHON_VERSION: ${{ inputs.python-version }}
run: |
set -eux
# Refresh the shell hashtable, to ensure correct `which` output.
hash -r
# `actions/cache@v3` doesn't always seem able to correctly unpack softlinks.
# Delete and recreate the softlinks pipx expects to have.
rm /opt/pipx/venvs/poetry/bin/python
cd /opt/pipx/venvs/poetry/bin
ln -s "$(which "python$PYTHON_VERSION")" python
chmod +x python
cd /opt/pipx_bin/
ln -s /opt/pipx/venvs/poetry/bin/poetry poetry
chmod +x poetry
# Ensure everything got set up correctly.
/opt/pipx/venvs/poetry/bin/python --version
/opt/pipx_bin/poetry --version
- name: Install poetry
if: steps.cache-bin-poetry.outputs.cache-hit != 'true'
shell: bash
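The cache key above is an opaque composite string, which is why the comment insists it be keyed on the poetry version: change any component and the old cache entry simply never matches again. A trivial sketch of the idea (illustrative only, not part of the action; version numbers made up):

```python
# A composite cache key encodes everything the cached artifact depends on.
def poetry_cache_key(os_name: str, arch: str, python: str, poetry: str) -> str:
    return f"bin-poetry-{os_name}-{arch}-py-{python}-{poetry}"

old_key = poetry_cache_key("Linux", "X64", "3.11", "1.5.1")
new_key = poetry_cache_key("Linux", "X64", "3.11", "1.6.1")
assert old_key != new_key  # bumping the poetry version invalidates the cached binary
```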

@ -87,7 +87,7 @@ jobs:
python-version: ${{ matrix.python-version }}
poetry-version: ${{ env.POETRY_VERSION }}
working-directory: ${{ inputs.working-directory }}
cache-key: lint
cache-key: lint-with-extras
- name: Check Poetry File
shell: bash
@ -102,9 +102,17 @@ jobs:
poetry lock --check
- name: Install dependencies
# Also installs dev/lint/test/typing dependencies, to ensure we have
# type hints for as many of our libraries as possible.
# This helps catch errors that can only be spotted once dependencies are installed, for example:
# https://github.com/langchain-ai/langchain/pull/10249/files#diff-935185cd488d015f026dcd9e19616ff62863e8cde8c0bee70318d3ccbca98341
#
# If you change this configuration, make sure to change the `cache-key`
# in the `poetry_setup` action above to stop using the old cache.
# It doesn't matter how you change it, any change will cause a cache-bust.
working-directory: ${{ inputs.working-directory }}
run: |
poetry install
poetry install --with dev,lint,test,typing
- name: Install langchain editable
working-directory: ${{ inputs.working-directory }}

@ -79,3 +79,15 @@ jobs:
- name: Run pydantic compatibility tests
shell: bash
run: make test
- name: Ensure the tests did not create any additional files
shell: bash
run: |
set -eu
STATUS="$(git status)"
echo "$STATUS"
# grep will exit non-zero if the target message isn't found,
# and `set -e` above will cause the step to fail.
echo "$STATUS" | grep 'nothing to commit, working tree clean'

@ -43,3 +43,15 @@ jobs:
- name: Run core tests
shell: bash
run: make test
- name: Ensure the tests did not create any additional files
shell: bash
run: |
set -eu
STATUS="$(git status)"
echo "$STATUS"
# grep will exit non-zero if the target message isn't found,
# and `set -e` above will cause the step to fail.
echo "$STATUS" | grep 'nothing to commit, working tree clean'

@ -6,6 +6,8 @@ on:
branches: [ master ]
pull_request:
paths:
- '.github/actions/poetry_setup/action.yml'
- '.github/tools/**'
- '.github/workflows/_lint.yml'
- '.github/workflows/_test.yml'
- '.github/workflows/_pydantic_compatibility.yml'
@ -81,3 +83,15 @@ jobs:
- name: Run extended tests
run: make extended_tests
- name: Ensure the tests did not create any additional files
shell: bash
run: |
set -eu
STATUS="$(git status)"
echo "$STATUS"
# grep will exit non-zero if the target message isn't found,
# and `set -e` above will cause the step to fail.
echo "$STATUS" | grep 'nothing to commit, working tree clean'

@ -6,6 +6,8 @@ on:
branches: [ master ]
pull_request:
paths:
- '.github/actions/poetry_setup/action.yml'
- '.github/tools/**'
- '.github/workflows/_lint.yml'
- '.github/workflows/_test.yml'
- '.github/workflows/langchain_experimental_ci.yml'
@ -113,3 +115,15 @@ jobs:
- name: Run extended tests
run: make extended_tests
- name: Ensure the tests did not create any additional files
shell: bash
run: |
set -eu
STATUS="$(git status)"
echo "$STATUS"
# grep will exit non-zero if the target message isn't found,
# and `set -e` above will cause the step to fail.
echo "$STATUS" | grep 'nothing to commit, working tree clean'

@ -47,3 +47,15 @@ jobs:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
run: |
make scheduled_tests
- name: Ensure the tests did not create any additional files
shell: bash
run: |
set -eu
STATUS="$(git status)"
echo "$STATUS"
# grep will exit non-zero if the target message isn't found,
# and `set -e` above will cause the step to fail.
echo "$STATUS" | grep 'nothing to commit, working tree clean'

@ -317,7 +317,7 @@
"Chatbots": "https://python.langchain.com/docs/use_cases/chatbots",
"Summarization": "https://python.langchain.com/docs/use_cases/summarization",
"Extraction": "https://python.langchain.com/docs/use_cases/extraction",
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"Tagging": "https://python.langchain.com/docs/use_cases/tagging",
"Code Understanding": "https://python.langchain.com/docs/use_cases/code_understanding",
"AutoGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/autogpt",
@ -400,7 +400,7 @@
"Summarization": "https://python.langchain.com/docs/use_cases/summarization",
"Extraction": "https://python.langchain.com/docs/use_cases/extraction",
"Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis",
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"QA over Documents": "https://python.langchain.com/docs/use_cases/question_answering/index",
"Retrieve from vector stores directly": "https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_text_generation",
"Improve document indexing with HyDE": "https://python.langchain.com/docs/use_cases/question_answering/how_to/hyde",
@ -641,7 +641,7 @@
"Chatbots": "https://python.langchain.com/docs/use_cases/chatbots",
"Extraction": "https://python.langchain.com/docs/use_cases/extraction",
"Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis",
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"HuggingGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/hugginggpt",
"Perform context-aware text splitting": "https://python.langchain.com/docs/use_cases/question_answering/how_to/document-context-aware-QA",
"Retrieve from vector stores directly": "https://python.langchain.com/docs/use_cases/question_answering/how_to/vector_db_text_generation",
@ -1009,7 +1009,7 @@
"LangSmith Walkthrough": "https://python.langchain.com/docs/guides/langsmith/walkthrough",
"Comparing Chain Outputs": "https://python.langchain.com/docs/guides/evaluation/examples/comparisons",
"Agent Trajectory": "https://python.langchain.com/docs/guides/evaluation/trajectory/trajectory_eval",
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"Multi-modal outputs: Image & Text": "https://python.langchain.com/docs/use_cases/multi_modal/image_agent",
"Agent Debates with Tools": "https://python.langchain.com/docs/use_cases/agent_simulations/two_agent_debate_tools",
"Multiple callback handlers": "https://python.langchain.com/docs/modules/callbacks/multiple_callbacks",
@ -1268,7 +1268,7 @@
"SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
"JSON Agent": "https://python.langchain.com/docs/integrations/toolkits/json",
"NIBittensorLLM": "https://python.langchain.com/docs/integrations/llms/bittensor",
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"BabyAGI with Tools": "https://python.langchain.com/docs/use_cases/agents/baby_agi_with_agent",
"Conversational Retrieval Agent": "https://python.langchain.com/docs/use_cases/question_answering/how_to/conversational_retrieval_agents",
"Plug-and-Plai": "https://python.langchain.com/docs/use_cases/agents/custom_agent_with_plugin_retrieval_using_plugnplai",
@ -1832,12 +1832,12 @@
"create_sql_agent": {
"CnosDB": "https://python.langchain.com/docs/integrations/providers/cnosdb",
"SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
"SQL": "https://python.langchain.com/docs/use_cases/sql"
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql"
},
"SQLDatabaseToolkit": {
"CnosDB": "https://python.langchain.com/docs/integrations/providers/cnosdb",
"SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"Use ToolKits with OpenAI Functions": "https://python.langchain.com/docs/modules/agents/how_to/use_toolkits_with_openai_functions"
},
"SageMakerCallbackHandler": {
@ -1899,7 +1899,7 @@
"Rebuff": "https://python.langchain.com/docs/integrations/providers/rebuff",
"SQL Database Agent": "https://python.langchain.com/docs/integrations/toolkits/sql_database",
"Cookbook": "https://python.langchain.com/docs/guides/expression_language/cookbook",
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"Multiple Retrieval Sources": "https://python.langchain.com/docs/use_cases/question_answering/how_to/multiple_retrieval"
},
"Weaviate": {
@ -3035,11 +3035,11 @@
"Interacting with APIs": "https://python.langchain.com/docs/use_cases/apis"
},
"create_sql_query_chain": {
"SQL": "https://python.langchain.com/docs/use_cases/sql",
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql",
"Multiple Retrieval Sources": "https://python.langchain.com/docs/use_cases/question_answering/how_to/multiple_retrieval"
},
"ElasticsearchDatabaseChain": {
"SQL": "https://python.langchain.com/docs/use_cases/sql"
"SQL": "https://python.langchain.com/docs/use_cases/qa_structured/sql"
},
"FileChatMessageHistory": {
"AutoGPT": "https://python.langchain.com/docs/use_cases/autonomous_agents/autogpt"

@ -2,11 +2,21 @@
import DocCardList from "@theme/DocCardList";
LangSmith helps you trace and evaluate your language model applications and intelligent agents to help you
[LangSmith](https://smith.langchain.com) helps you trace and evaluate your language model applications and intelligent agents to help you
move from prototype to production.
Check out the [interactive walkthrough](/docs/guides/langsmith/walkthrough) below to get started.
For more information, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/)
For more information, please refer to the [LangSmith documentation](https://docs.smith.langchain.com/).
For tutorials and other end-to-end examples demonstrating ways to integrate LangSmith in your workflow,
check out the [LangSmith Cookbook](https://github.com/langchain-ai/langsmith-cookbook). Some of the guides therein include:
- Leveraging user feedback in your JS application ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/feedback-examples/nextjs/README.md)).
- Building an automated feedback pipeline ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/feedback-examples/algorithmic-feedback/algorithmic_feedback.ipynb)).
- How to evaluate and audit your RAG workflows ([link](https://github.com/langchain-ai/langsmith-cookbook/tree/main/testing-examples/qa-correctness)).
- How to fine-tune a LLM on real usage data ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/fine-tuning-examples/export-to-openai/fine-tuning-on-chat-runs.ipynb)).
- How to use the [LangChain Hub](https://smith.langchain.com/hub) to version your prompts ([link](https://github.com/langchain-ai/langsmith-cookbook/blob/main/hub-examples/retrieval-qa-chain/retrieval-qa.ipynb)).
<DocCardList />

@ -1,6 +1,6 @@
# Conversation Buffer
This notebook shows how to use `ConversationBufferMemory`. This memory allows for storing of messages and then extracts the messages in a variable.
This notebook shows how to use `ConversationBufferMemory`. This memory allows for storing messages and then extracts the messages in a variable.
We can first extract it as a string.
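A minimal sketch of that flow (assumes `langchain` is installed; the exchange itself is made up):

```python
from langchain.memory import ConversationBufferMemory

memory = ConversationBufferMemory()
memory.save_context({"input": "hi"}, {"output": "hello there"})

# By default the buffered messages come back as one formatted string.
print(memory.load_memory_variables({}))
# {'history': 'Human: hi\nAI: hello there'}
```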

@ -1,9 +0,0 @@
---
sidebar_position: 3
---
# Web Scraping
Web scraping has historically been a challenging endeavor due to the ever-changing nature of website structures, making it tedious for developers to maintain their scraping scripts. Traditional methods often rely on specific HTML tags and patterns which, when altered, can disrupt data extraction processes.
Enter the LLM-based method for parsing HTML: By leveraging the capabilities of LLMs, and especially OpenAI Functions in LangChain's extraction chain, developers can instruct the model to extract only the desired data in a specified format. This method not only streamlines the extraction process but also significantly reduces the time spent on manual debugging and script modifications. Its adaptability means that even if websites undergo significant design changes, the extraction remains consistent and robust. This level of resilience translates to reduced maintenance efforts, cost savings, and ensures a higher quality of extracted data. Compared to its predecessors, the LLM-based approach wins out in the web scraping domain by transforming a historically cumbersome task into a more automated and efficient process.

@ -3178,7 +3178,11 @@
},
{
"source": "/en/latest/use_cases/tabular.html",
"destination": "/docs/use_cases/tabular"
"destination": "/docs/use_cases/qa_structured"
},
{
"source": "/docs/use_cases/sql(/?)",
"destination": "/docs/use_cases/qa_structured/sql"
},
{
"source": "/en/latest/youtube.html",
@ -3370,7 +3374,7 @@
},
{
"source": "/docs/modules/chains/popular/sqlite",
"destination": "/docs/use_cases/tabular/sqlite"
"destination": "/docs/use_cases/qa_structured/sql"
},
{
"source": "/docs/modules/chains/popular/openai_functions",
@ -3582,7 +3586,7 @@
},
{
"source": "/docs/modules/chains/additional/elasticsearch_database",
"destination": "/docs/use_cases/tabular/elasticsearch_database"
"destination": "/docs/use_cases/qa_structured/integrations/elasticsearch"
},
{
"source": "/docs/modules/chains/additional/tagging",

@ -1,6 +1,6 @@
# YouTube videos
⛓ icon marks a new addition [last update 2023-06-20]
⛓ icon marks a new addition [last update 2023-09-05]
### [Official LangChain YouTube channel](https://www.youtube.com/@LangChain)
@ -86,20 +86,20 @@
- [`Llama Index`: Chat with Documentation using URL Loader](https://youtu.be/XJRoDEctAwA) by [Merk](https://www.youtube.com/@merksworld)
- [Using OpenAI, LangChain, and `Gradio` to Build Custom GenAI Applications](https://youtu.be/1MsmqMg3yUc) by [David Hundley](https://www.youtube.com/@dkhundley)
- [LangChain, Chroma DB, OpenAI Beginner Guide | ChatGPT with your PDF](https://youtu.be/FuqdVNB_8c0)
- [Build AI chatbot with custom knowledge base using OpenAI API and GPT Index](https://youtu.be/vDZAZuaXf48) by [Irina Nik](https://www.youtube.com/@irina_nik)
- [Build Your Own Auto-GPT Apps with LangChain (Python Tutorial)](https://youtu.be/NYSWn1ipbgg) by [Dave Ebbelaar](https://www.youtube.com/@daveebbelaar)
- [Chat with Multiple `PDFs` | LangChain App Tutorial in Python (Free LLMs and Embeddings)](https://youtu.be/dXxQ0LR-3Hg) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- [Chat with a `CSV` | `LangChain Agents` Tutorial (Beginners)](https://youtu.be/tjeti5vXWOU) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- [Create Your Own ChatGPT with `PDF` Data in 5 Minutes (LangChain Tutorial)](https://youtu.be/au2WVVGUvc8) by [Liam Ottley](https://www.youtube.com/@LiamOttley)
- [Using ChatGPT with YOUR OWN Data. This is magical. (LangChain OpenAI API)](https://youtu.be/9AXP7tCI9PI) by [TechLead](https://www.youtube.com/@TechLead)
- [Build a Custom Chatbot with OpenAI: `GPT-Index` & LangChain | Step-by-Step Tutorial](https://youtu.be/FIDv6nc4CgU) by [Fabrikod](https://www.youtube.com/@fabrikod)
- [`Flowise` is an open source no-code UI visual tool to build 🦜🔗LangChain applications](https://youtu.be/CovAPtQPU0k) by [Cobus Greyling](https://www.youtube.com/@CobusGreylingZA)
- [LangChain & GPT 4 For Data Analysis: The `Pandas` Dataframe Agent](https://youtu.be/rFQ5Kmkd4jc) by [Rabbitmetrics](https://www.youtube.com/@rabbitmetrics)
- [`GirlfriendGPT` - AI girlfriend with LangChain](https://youtu.be/LiN3D1QZGQw) by [Toolfinder AI](https://www.youtube.com/@toolfinderai)
- [`PrivateGPT`: Chat to your FILES OFFLINE and FREE [Installation and Tutorial]](https://youtu.be/G7iLllmx4qc) by [Prompt Engineering](https://www.youtube.com/@engineerprompt)
- [How to build with Langchain 10x easier | ⛓️ LangFlow & `Flowise`](https://youtu.be/Ya1oGL7ZTvU) by [AI Jason](https://www.youtube.com/@AIJasonZ)
- [Getting Started With LangChain In 20 Minutes- Build Celebrity Search Application](https://youtu.be/_FpT1cwcSLg) by [Krish Naik](https://www.youtube.com/@krishnaik06)
- [Build AI chatbot with custom knowledge base using OpenAI API and GPT Index](https://youtu.be/vDZAZuaXf48) by [Irina Nik](https://www.youtube.com/@irina_nik)
- [Build Your Own Auto-GPT Apps with LangChain (Python Tutorial)](https://youtu.be/NYSWn1ipbgg) by [Dave Ebbelaar](https://www.youtube.com/@daveebbelaar)
- [Chat with Multiple `PDFs` | LangChain App Tutorial in Python (Free LLMs and Embeddings)](https://youtu.be/dXxQ0LR-3Hg) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- [Chat with a `CSV` | `LangChain Agents` Tutorial (Beginners)](https://youtu.be/tjeti5vXWOU) by [Alejandro AO - Software & Ai](https://www.youtube.com/@alejandro_ao)
- [Create Your Own ChatGPT with `PDF` Data in 5 Minutes (LangChain Tutorial)](https://youtu.be/au2WVVGUvc8) by [Liam Ottley](https://www.youtube.com/@LiamOttley)
- [Using ChatGPT with YOUR OWN Data. This is magical. (LangChain OpenAI API)](https://youtu.be/9AXP7tCI9PI) by [TechLead](https://www.youtube.com/@TechLead)
- [Build a Custom Chatbot with OpenAI: `GPT-Index` & LangChain | Step-by-Step Tutorial](https://youtu.be/FIDv6nc4CgU) by [Fabrikod](https://www.youtube.com/@fabrikod)
- [`Flowise` is an open source no-code UI visual tool to build 🦜🔗LangChain applications](https://youtu.be/CovAPtQPU0k) by [Cobus Greyling](https://www.youtube.com/@CobusGreylingZA)
- [LangChain & GPT 4 For Data Analysis: The `Pandas` Dataframe Agent](https://youtu.be/rFQ5Kmkd4jc) by [Rabbitmetrics](https://www.youtube.com/@rabbitmetrics)
- [`GirlfriendGPT` - AI girlfriend with LangChain](https://youtu.be/LiN3D1QZGQw) by [Toolfinder AI](https://www.youtube.com/@toolfinderai)
- [`PrivateGPT`: Chat to your FILES OFFLINE and FREE [Installation and Tutorial]](https://youtu.be/G7iLllmx4qc) by [Prompt Engineering](https://www.youtube.com/@engineerprompt)
- [How to build with Langchain 10x easier | ⛓️ LangFlow & `Flowise`](https://youtu.be/Ya1oGL7ZTvU) by [AI Jason](https://www.youtube.com/@AIJasonZ)
- [Getting Started With LangChain In 20 Minutes- Build Celebrity Search Application](https://youtu.be/_FpT1cwcSLg) by [Krish Naik](https://www.youtube.com/@krishnaik06)
- ⛓ [LangChain HowTo and Guides YouTube playlist](https://www.youtube.com/playlist?list=PL8motc6AQftk1Bs42EW45kwYbyJ4jOdiZ) by [Sam Witteveen](https://www.youtube.com/@samwitteveenai/)
### [Prompt Engineering and LangChain](https://www.youtube.com/watch?v=muXbPpG_ys4&list=PLEJK-H61Xlwzm5FYLDdKt_6yibO33zoMW) by [Venelin Valkov](https://www.youtube.com/@venelin_valkov)

File diff suppressed because it is too large.

@ -2,7 +2,7 @@
If you're building with LLMs, at some point something will break, and you'll need to debug. A model call will fail, or the model output will be misformatted, or there will be some nested model calls and it won't be clear where along the way an incorrect output was created.
Here's a few different tools and functionalities to aid in debugging.
Here are a few different tools and functionalities to aid in debugging.
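One of those switches in this era of LangChain is the global debug flag, which dumps the inputs and outputs of every component call, however deeply nested. A minimal sketch (assumes `langchain` is installed and `OPENAI_API_KEY` is set):

```python
import langchain
from langchain.llms import OpenAI

# Print every chain/LLM invocation with its inputs and outputs.
langchain.debug = True

llm = OpenAI(temperature=0)
llm.predict("What is 2 + 2?")
```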
@ -18,9 +18,9 @@ For anyone building production-grade LLM applications, we highly recommend using
If you're prototyping in Jupyter Notebooks or running Python scripts, it can be helpful to print out the intermediate steps of a Chain run.
There's a number of ways to enable printing at varying degrees of verbosity.
There are a number of ways to enable printing at varying degrees of verbosity.
Let's suppose we have a simple agent and want to visualize the actions it takes and tool outputs it receives. Without any debugging, here's what we see:
Let's suppose we have a simple agent, and want to visualize the actions it takes and tool outputs it receives. Without any debugging, here's what we see:
```python

@ -28,7 +28,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -47,16 +47,16 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My name is Mrs. Rachel Chen DDS, call me at 849-829-7628x073 or email me at christopherfrey@example.org'"
"'My name is Laura Ruiz, call me at +1-412-982-8374x13414 or email me at javierwatkins@example.net'"
]
},
"execution_count": 14,
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
@ -82,7 +82,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@ -94,35 +94,53 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"text = f\"\"\"Slim Shady recently lost his wallet. \n",
"Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
"If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"AIMessage(content='You can find our super secret data at https://www.ross.com/', additional_kwargs={}, example=False)"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"Dear Sir/Madam,\n",
"\n",
"We regret to inform you that Richard Fields has recently misplaced his wallet, which contains a sum of cash and his credit card bearing the number 30479847307774. \n",
"\n",
"Should you happen to come across it, we kindly request that you contact us immediately at 6439182672 or via email at frank45@example.com.\n",
"\n",
"Thank you for your attention to this matter.\n",
"\n",
"Yours faithfully,\n",
"\n",
"[Your Name]\n"
]
}
],
"source": [
"from langchain.prompts.prompt import PromptTemplate\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.schema.runnable import RunnablePassthrough\n",
"\n",
"template = \"\"\"According to this text, where can you find our super secret data?\n",
"anonymizer = PresidioAnonymizer()\n",
"\n",
"{anonymized_text}\n",
"template = \"\"\"Rewrite this text into an official, short email:\n",
"\n",
"Answer:\"\"\"\n",
"{anonymized_text}\"\"\"\n",
"prompt = PromptTemplate.from_template(template)\n",
"llm = ChatOpenAI()\n",
"llm = ChatOpenAI(temperature=0)\n",
"\n",
"chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n",
"chain.invoke(\"You can find our super secret data at https://supersecretdata.com\")"
"response = chain.invoke(text)\n",
"print(response.content)"
]
},
{
@ -135,16 +153,16 @@
},
{
"cell_type": "code",
"execution_count": 18,
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My name is Gabrielle Edwards, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
"'My name is Adrian Fleming, call me at 313-666-7440 or email me at real.slim.shady@gmail.com'"
]
},
"execution_count": 18,
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
@ -166,16 +184,16 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My name is Victoria Mckinney, call me at 713-549-8623 or email me at real.slim.shady@gmail.com'"
"'My name is Justin Miller, call me at 761-824-1889 or email me at real.slim.shady@gmail.com'"
]
},
"execution_count": 3,
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
@ -201,16 +219,16 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My name is Billy Russo, call me at 970-996-9453x038 or email me at jamie80@example.org'"
"'My name is Dr. Jennifer Baker, call me at (508)839-9329x232 or email me at ehamilton@example.com'"
]
},
"execution_count": 4,
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
@ -232,16 +250,16 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My polish phone number is EVIA70648911396944'"
"'My polish phone number is NRGN41434238921378'"
]
},
"execution_count": 5,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -261,7 +279,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
@ -291,7 +309,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
@ -308,7 +326,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 12,
"metadata": {},
"outputs": [
{
@ -337,16 +355,16 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'+48 533 220 543'"
"'511 622 683'"
]
},
"execution_count": 9,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
@ -374,7 +392,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
@ -389,7 +407,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
@ -398,16 +416,16 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My polish phone number is +48 692 715 636'"
"'My polish phone number is +48 734 630 977'"
]
},
"execution_count": 12,
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
@ -443,7 +461,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
"version": "3.11.4"
}
},
"nbformat": 4,

@ -0,0 +1,461 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Reversible data anonymization with Microsoft Presidio\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/guides/privacy/presidio_reversible_anonymization.ipynb)\n",
"\n",
"\n",
"## Use case\n",
"\n",
"We have already written about the importance of anonymizing sensitive data in the previous section. **Reversible Anonymization** is an equally essential technology while sharing information with language models, as it balances data protection with data usability. This technique involves masking sensitive personally identifiable information (PII), yet it can be reversed and original data can be restored when authorized users need it. Its main advantage lies in the fact that while it conceals individual identities to prevent misuse, it also allows the concealed data to be accurately unmasked should it be necessary for legal or compliance purposes. \n",
"\n",
"## Overview\n",
"\n",
"We implemented the `PresidioReversibleAnonymizer`, which consists of two parts:\n",
"\n",
"1. anonymization - it works the same way as `PresidioAnonymizer`, plus the object itself stores a mapping of made-up values to original ones, for example:\n",
"```\n",
" {\n",
" \"PERSON\": {\n",
" \"<anonymized>\": \"<original>\",\n",
" \"John Doe\": \"Slim Shady\"\n",
" },\n",
" \"PHONE_NUMBER\": {\n",
" \"111-111-1111\": \"555-555-5555\"\n",
" }\n",
" ...\n",
" }\n",
"```\n",
"\n",
"2. deanonymization - using the mapping described above, it matches fake data with original data and then substitutes it.\n",
"\n",
"Between anonymization and deanonymization user can perform different operations, for example, passing the output to LLM.\n",
"\n",
"## Quickstart\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# Install necessary packages\n",
"# ! pip install langchain langchain-experimental openai presidio-analyzer presidio-anonymizer spacy Faker\n",
"# ! python -m spacy download en_core_web_lg"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`PresidioReversibleAnonymizer` is not significantly different from its predecessor (`PresidioAnonymizer`) in terms of anonymization:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'My name is Maria Lynch, call me at 7344131647 or email me at jamesmichael@example.com. By the way, my card number is: 4838637940262'"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
"\n",
"anonymizer = PresidioReversibleAnonymizer(\n",
" analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
" # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
" # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
" faker_seed=42,\n",
")\n",
"\n",
"anonymizer.anonymize(\n",
" \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
" \"By the way, my card number is: 4916 0387 9536 0861\"\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This is what the full string we want to deanonymize looks like:"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Maria Lynch recently lost his wallet. \n",
"Inside is some cash and his credit card with the number 4838637940262. \n",
"If you would find it, please call at 7344131647 or write an email here: jamesmichael@example.com.\n",
"Maria Lynch would be very grateful!\n"
]
}
],
"source": [
"# We know this data, as we set the faker_seed parameter\n",
"fake_name = \"Maria Lynch\"\n",
"fake_phone = \"7344131647\"\n",
"fake_email = \"jamesmichael@example.com\"\n",
"fake_credit_card = \"4838637940262\"\n",
"\n",
"anonymized_text = f\"\"\"{fake_name} recently lost his wallet. \n",
"Inside is some cash and his credit card with the number {fake_credit_card}. \n",
"If you would find it, please call at {fake_phone} or write an email here: {fake_email}.\n",
"{fake_name} would be very grateful!\"\"\"\n",
"\n",
"print(anonymized_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And now, using the `deanonymize` method, we can reverse the process:"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Slim Shady recently lost his wallet. \n",
"Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
"If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\n",
"Slim Shady would be very grateful!\n"
]
}
],
"source": [
"print(anonymizer.deanonymize(anonymized_text))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Using with LangChain Expression Language\n",
"\n",
"With LCEL we can easily chain together anonymization and deanonymization with the rest of our application. This is an example of using the anonymization mechanism with a query to LLM (without deanonymization for now):"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"text = f\"\"\"Slim Shady recently lost his wallet. \n",
"Inside is some cash and his credit card with the number 4916 0387 9536 0861. \n",
"If you would find it, please call at 313-666-7440 or write an email here: real.slim.shady@gmail.com.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dear Sir/Madam,\n",
"\n",
"We regret to inform you that Mr. Dana Rhodes has reported the loss of his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4397528473885757. \n",
"\n",
"If you happen to come across the aforementioned wallet, we kindly request that you contact us immediately at 258-481-7074x714 or via email at laurengoodman@example.com.\n",
"\n",
"Your prompt assistance in this matter would be greatly appreciated.\n",
"\n",
"Yours faithfully,\n",
"\n",
"[Your Name]\n"
]
}
],
"source": [
"from langchain.prompts.prompt import PromptTemplate\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"anonymizer = PresidioReversibleAnonymizer()\n",
"\n",
"template = \"\"\"Rewrite this text into an official, short email:\n",
"\n",
"{anonymized_text}\"\"\"\n",
"prompt = PromptTemplate.from_template(template)\n",
"llm = ChatOpenAI(temperature=0)\n",
"\n",
"chain = {\"anonymized_text\": anonymizer.anonymize} | prompt | llm\n",
"response = chain.invoke(text)\n",
"print(response.content)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now, let's add **deanonymization step** to our sequence:"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Dear Sir/Madam,\n",
"\n",
"We regret to inform you that Mr. Slim Shady has recently misplaced his wallet. The wallet contains a sum of cash and his credit card, bearing the number 4916 0387 9536 0861. \n",
"\n",
"If by any chance you come across the lost wallet, kindly contact us immediately at 313-666-7440 or send an email to real.slim.shady@gmail.com.\n",
"\n",
"Your prompt assistance in this matter would be greatly appreciated.\n",
"\n",
"Yours faithfully,\n",
"\n",
"[Your Name]\n"
]
}
],
"source": [
"chain = chain | (lambda ai_message: anonymizer.deanonymize(ai_message.content))\n",
"response = chain.invoke(text)\n",
"print(response)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Anonymized data was given to the model itself, and therefore it was protected from being leaked to the outside world. Then, the model's response was processed, and the factual value was replaced with the real one."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Extra knowledge"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`PresidioReversibleAnonymizer` stores the mapping of the fake values to the original values in the `deanonymizer_mapping` parameter, where key is fake PII and value is the original one: "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'PERSON': {'Maria Lynch': 'Slim Shady'},\n",
" 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
" 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861'}}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer\n",
"\n",
"anonymizer = PresidioReversibleAnonymizer(\n",
" analyzed_fields=[\"PERSON\", \"PHONE_NUMBER\", \"EMAIL_ADDRESS\", \"CREDIT_CARD\"],\n",
" # Faker seed is used here to make sure the same fake data is generated for the test purposes\n",
" # In production, it is recommended to remove the faker_seed parameter (it will default to None)\n",
" faker_seed=42,\n",
")\n",
"\n",
"anonymizer.anonymize(\n",
" \"My name is Slim Shady, call me at 313-666-7440 or email me at real.slim.shady@gmail.com. \"\n",
" \"By the way, my card number is: 4916 0387 9536 0861\"\n",
")\n",
"\n",
"anonymizer.deanonymizer_mapping"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Anonymizing more texts will result in new mapping entries:"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Do you have his VISA card number? Yep, it's 3537672423884966. I'm William Bowman by the way.\n"
]
},
{
"data": {
"text/plain": [
"{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n",
" 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
" 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n",
" '3537672423884966': '4001 9192 5753 7193'}}"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"print(\n",
" anonymizer.anonymize(\n",
" \"Do you have his VISA card number? Yep, it's 4001 9192 5753 7193. I'm John Doe by the way.\"\n",
" )\n",
")\n",
"\n",
"anonymizer.deanonymizer_mapping"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can save the mapping itself to a file for future use: "
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# We can save the deanonymizer mapping as a JSON or YAML file\n",
"\n",
"anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n",
"# anonymizer.save_deanonymizer_mapping(\"deanonymizer_mapping.yaml\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And then, load it in another `PresidioReversibleAnonymizer` instance:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{}"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"anonymizer = PresidioReversibleAnonymizer()\n",
"\n",
"anonymizer.deanonymizer_mapping"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'PERSON': {'Maria Lynch': 'Slim Shady', 'William Bowman': 'John Doe'},\n",
" 'PHONE_NUMBER': {'7344131647': '313-666-7440'},\n",
" 'EMAIL_ADDRESS': {'jamesmichael@example.com': 'real.slim.shady@gmail.com'},\n",
" 'CREDIT_CARD': {'4838637940262': '4916 0387 9536 0861',\n",
" '3537672423884966': '4001 9192 5753 7193'}}"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"anonymizer.load_deanonymizer_mapping(\"deanonymizer_mapping.json\")\n",
"\n",
"anonymizer.deanonymizer_mapping"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Future works\n",
"\n",
"- **instance anonymization** - at this point, each occurrence of PII is treated as a separate entity and separately anonymized. Therefore, two occurrences of the name John Doe in the text will be changed to two different names. It is therefore worth introducing support for full instance detection, so that repeated occurrences are treated as a single object.\n",
"- **better matching and substitution of fake values for real ones** - currently the strategy is based on matching full strings and then substituting them. Due to the indeterminism of language models, it may happen that the value in the answer is slightly changed (e.g. *John Doe* -> *John* or *Main St, New York* -> *New York*) and such a substitution is then no longer possible. Therefore, it is worth adjusting the matching for your needs."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
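The deanonymization described in the notebook above is exact-string substitution driven by the stored mapping. A minimal standalone sketch of that logic (illustrative; not the `langchain_experimental` implementation), using values taken from the notebook's own mapping:

```python
# Mapping copied from the notebook output: fake value -> original value.
deanonymizer_mapping = {
    "PERSON": {"Maria Lynch": "Slim Shady"},
    "PHONE_NUMBER": {"7344131647": "313-666-7440"},
}

def deanonymize(text: str, mapping: dict) -> str:
    for entity_map in mapping.values():
        for fake, original in entity_map.items():
            text = text.replace(fake, original)
    return text

print(deanonymize("Maria Lynch called from 7344131647.", deanonymizer_mapping))
# Slim Shady called from 313-666-7440.
```

Because the match is on full strings, an LLM that slightly rewrites a fake value defeats the substitution, which is exactly the limitation the "Future work" section flags.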

@ -512,9 +512,9 @@
"# Examples\n",
"---\n",
"\n",
"## With HuggingFace Hub Models\n",
"## With Hugging Face Hub Models\n",
"\n",
"Get your API Key from Huggingface hub - https://huggingface.co/docs/api-inference/quicktour#get-your-api-token"
"Get your API Key from Hugging Face hub - https://huggingface.co/docs/api-inference/quicktour#get-your-api-token"
]
},
{

@ -18,7 +18,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
@ -93,8 +93,22 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "langchain",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.5"
},
"orig_nbformat": 4
},

@ -31,11 +31,16 @@
"outputs": [],
"source": [
"# get new tokens: https://app.banana.dev/\n",
"# We need two tokens, not just an `api_key`: `BANANA_API_KEY` and `YOUR_MODEL_KEY`\n",
"# We need three parameters to make a Banana.dev API call:\n",
"# * a team api key\n",
"# * the model's unique key\n",
"# * the model's url slug\n",
"\n",
"import os\n",
"from getpass import getpass\n",
"\n",
"# You can get this from the main dashboard\n",
"# at https://app.banana.dev\n",
"os.environ[\"BANANA_API_KEY\"] = \"YOUR_API_KEY\"\n",
"# OR\n",
"# BANANA_API_KEY = getpass()"
@ -70,7 +75,9 @@
"metadata": {},
"outputs": [],
"source": [
"llm = Banana(model_key=\"YOUR_MODEL_KEY\")"
"# Both of these are found in your model's \n",
"# detail page in https://app.banana.dev\n",
"llm = Banana(model_key=\"YOUR_MODEL_KEY\", model_url_slug=\"YOUR_MODEL_URL_SLUG\")"
]
},
{

@ -236,7 +236,7 @@
"metadata": {},
"outputs": [],
"source": [
"llm_oss = VertexAIModelGarden(\n",
"llm = VertexAIModelGarden(\n",
" project=\"YOUR PROJECT\",\n",
" endpoint_id=\"YOUR ENDPOINT_ID\"\n",
")"
@ -248,14 +248,25 @@
"metadata": {},
"outputs": [],
"source": [
"llm_oss(\"What is the meaning of life?\")"
"llm(\"What is the meaning of life?\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"You can also use it as a chain:"
"Like all LLMs, we can then compose it with other components:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from langchain.prompts import PromptTemplate\n",
"\n",
"prompt = PromptTemplate.from_template(\"What is the meaning of {thing}?\")"
]
},
{
@ -264,17 +275,17 @@
"metadata": {},
"outputs": [],
"source": [
"llm_oss_chain = LLMChain(prompt=prompt, llm=llm_oss(\"What is the meaning of life?\")\n",
")\n",
"llm_oss_chain.run(question)"
"llm_oss_chain = prompt | llm\n",
"\n",
"llm_oss_chain.invoke({\"thing\": \"life\"})"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "poetry-venv",
"language": "python",
"name": "python3"
"name": "poetry-venv"
},
"language_info": {
"codemirror_mode": {
@ -286,7 +297,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.9.1"
},
"vscode": {
"interpreter": {

@ -1,79 +1,72 @@
# Banana
This page covers how to use the Banana ecosystem within LangChain.
It is broken into two parts: installation and setup, and then references to specific Banana wrappers.
Banana provides serverless GPU inference for AI models, including a CI/CD build pipeline and a simple Python framework (Potassium) to serve your models.
This page covers how to use the [Banana](https://www.banana.dev) ecosystem within LangChain.
It is broken into two parts:
* installation and setup,
* and then references to specific Banana wrappers.
## Installation and Setup
- Install with `pip install banana-dev`
- Get an Banana api key and set it as an environment variable (`BANANA_API_KEY`)
- Get a Banana API key from the [Banana.dev dashboard](https://app.banana.dev) and set it as an environment variable (`BANANA_API_KEY`)
- Get your model's key and url slug from the model's details page
## Define your Banana Template
If you want to use an available language model template you can find one [here](https://app.banana.dev/templates/conceptofmind/serverless-template-palmyra-base).
This template uses the Palmyra-Base model by [Writer](https://writer.com/product/api/).
You can check out an example Banana repository [here](https://github.com/conceptofmind/serverless-template-palmyra-base).
You'll need to set up a GitHub repo for your Banana app. You can get started in 5 minutes using [this guide](https://docs.banana.dev/banana-docs/).
Alternatively, for a ready-to-go LLM example, you can check out Banana's [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq) GitHub repository. Just fork it and deploy it within Banana.
Other starter repos are available [here](https://github.com/orgs/bananaml/repositories?q=demo-&type=all&language=&sort=).
## Build the Banana app
Banana Apps must include the "output" key in the return json.
There is a rigid response structure.
To use Banana apps within LangChain, they must include the `outputs` key
in the returned json, and the value must be a string.
```python
# Return the results as a dictionary
result = {'output': result}
result = {'outputs': result}
```
An example inference function would be:
```python
def inference(model_inputs:dict) -> dict:
global model
global tokenizer
# Parse out your arguments
prompt = model_inputs.get('prompt', None)
if prompt == None:
return {'message': "No prompt provided"}
# Run the model
input_ids = tokenizer.encode(prompt, return_tensors='pt').cuda()
output = model.generate(
input_ids,
max_length=100,
do_sample=True,
top_k=50,
top_p=0.95,
num_return_sequences=1,
temperature=0.9,
early_stopping=True,
no_repeat_ngram_size=3,
num_beams=5,
length_penalty=1.5,
repetition_penalty=1.5,
bad_words_ids=[[tokenizer.encode(' ', add_prefix_space=True)[0]]]
)
result = tokenizer.decode(output[0], skip_special_tokens=True)
# Return the results as a dictionary
result = {'output': result}
return result
@app.handler("/")
def handler(context: dict, request: Request) -> Response:
"""Handle a request to generate code from a prompt."""
model = context.get("model")
tokenizer = context.get("tokenizer")
max_new_tokens = request.json.get("max_new_tokens", 512)
temperature = request.json.get("temperature", 0.7)
prompt = request.json.get("prompt")
prompt_template=f'''[INST] Write code to solve the following coding problem that obeys the constraints and passes the example test cases. Please wrap your code answer using ```:
{prompt}
[/INST]
'''
input_ids = tokenizer(prompt_template, return_tensors='pt').input_ids.cuda()
output = model.generate(inputs=input_ids, temperature=temperature, max_new_tokens=max_new_tokens)
result = tokenizer.decode(output[0])
return Response(json={"outputs": result}, status=200)
```
You can find a full example of a Banana app [here](https://github.com/conceptofmind/serverless-template-palmyra-base/blob/main/app.py).
This example is from the `app.py` file in [CodeLlama-7B-Instruct-GPTQ](https://github.com/bananaml/demo-codellama-7b-instruct-gptq).
## Wrappers
### LLM
There exists an Banana LLM wrapper, which you can access with
Within LangChain, there exists a Banana LLM wrapper, which you can access with
```python
from langchain.llms import Banana
```
You need to provide a model key located in the dashboard:
You need to provide a model key and model url slug, which you can get from the model's details page in the [Banana.dev dashboard](https://app.banana.dev).
```python
llm = Banana(model_key="YOUR_MODEL_KEY")
llm = Banana(model_key="YOUR_MODEL_KEY", model_url_slug="YOUR_MODEL_URL_SLUG")
```

@ -5,13 +5,23 @@
"id": "ed47bb62",
"metadata": {},
"source": [
"# Hugging Face Hub\n",
"# Hugging Face\n",
"Let's load the Hugging Face Embedding class."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": null,
"id": "16b20335-da1d-46ba-aa23-fbf3e2c6fe60",
"metadata": {},
"outputs": [],
"source": [
"!pip install langchain sentence_transformers"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "861521a9",
"metadata": {},
"outputs": [],
@ -21,7 +31,7 @@
},
{
"cell_type": "code",
"execution_count": 16,
"execution_count": 3,
"id": "ff9be586",
"metadata": {},
"outputs": [],
@ -31,7 +41,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 3,
"id": "d0a98ae9",
"metadata": {},
"outputs": [],
@ -41,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 5,
"id": "5d6c682b",
"metadata": {},
"outputs": [],
@ -51,7 +61,28 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 6,
"id": "b57b8ce9-ef7d-4e63-979e-aa8763d1f9a8",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[-0.04895168915390968, -0.03986193612217903, -0.021562768146395683]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query_result[:3]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "bb5e74c0",
"metadata": {},
"outputs": [],
@ -59,20 +90,72 @@
"doc_result = embeddings.embed_documents([text])"
]
},
{
"cell_type": "markdown",
"id": "92019ef1-5d30-4985-b4e6-c0d98bdfe265",
"metadata": {},
"source": [
"## Hugging Face Inference API\n",
"We can also access embedding models via the Hugging Face Inference API, which does not require us to install ``sentence_transformers`` and download models locally."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aaad49f8",
"execution_count": 1,
"id": "66f5c6ba-1446-43e1-b012-800d17cef300",
"metadata": {},
"outputs": [],
"source": []
"outputs": [
{
"name": "stdin",
"output_type": "stream",
"text": [
"Enter your HF Inference API Key:\n",
"\n",
" ········\n"
]
}
],
"source": [
"import getpass\n",
"\n",
"inference_api_key = getpass.getpass(\"Enter your HF Inference API Key:\\n\\n\")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "d0623c1f-cd82-4862-9bce-3655cb9b66ac",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[-0.038338541984558105, 0.1234646737575531, -0.028642963618040085]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings\n",
"\n",
"embeddings = HuggingFaceInferenceAPIEmbeddings(\n",
" api_key=inference_api_key,\n",
" model_name=\"sentence-transformers/all-MiniLM-l6-v2\"\n",
")\n",
"\n",
"query_result = embeddings.embed_query(text)\n",
"query_result[:3]"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"display_name": "poetry-venv",
"language": "python",
"name": "python3"
"name": "poetry-venv"
},
"language_info": {
"codemirror_mode": {

@ -0,0 +1,126 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NucliaDB\n",
"\n",
"You can use a local NucliaDB instance or use [Nuclia Cloud](https://nuclia.cloud).\n",
"\n",
"When using a local instance, you need a Nuclia Understanding API key, so your texts are properly vectorized and indexed. You can get a key by creating a free account at [https://nuclia.cloud](https://nuclia.cloud), and then [create a NUA key](https://docs.nuclia.dev/docs/docs/using/understanding/intro)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#!pip install langchain nuclia"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage with nuclia.cloud"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.vectorstores.nucliadb import NucliaDB\n",
"API_KEY = \"YOUR_API_KEY\"\n",
"\n",
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=False, api_key=API_KEY)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Usage with a local instance\n",
"\n",
"Note: By default `backend` is set to `http://localhost:8080`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.vectorstores.nucliadb import NucliaDB\n",
"\n",
"ndb = NucliaDB(knowledge_box=\"YOUR_KB_ID\", local=True, backend=\"http://my-local-server\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Add and delete texts to your Knowledge Box"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ids = ndb.add_texts([\"This is a new test\", \"This is a second test\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ndb.delete(ids=ids)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Search in your Knowledge Box"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"results = ndb.similarity_search(\"Who was inspired by Ada Lovelace?\")\n",
"print(res.page_content)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,207 @@
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# sqlite-vss\n",
"\n",
">[sqlite-vss](https://alexgarcia.xyz/sqlite-vss/) is an SQLite extension designed for vector search, emphasizing local-first operations and easy integration into applications without external servers. Leveraging the Faiss library, it offers efficient similarity search and clustering capabilities.\n",
"\n",
"This notebook shows how to use the `SQLiteVSS` vector database."
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# You need to install sqlite-vss as a dependency.\n",
"%pip install sqlite-vss"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "markdown",
"source": [
"### Quickstart"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 2,
"outputs": [
{
"data": {
"text/plain": "'Tonight. I call on the Senate to: Pass the Freedom to Vote Act. Pass the John Lewis Voting Rights Act. And while youre at it, pass the Disclose Act so Americans can know who is funding our elections. \\n\\nTonight, Id like to honor someone who has dedicated his life to serve this country: Justice Stephen Breyer—an Army veteran, Constitutional scholar, and retiring Justice of the United States Supreme Court. Justice Breyer, thank you for your service. \\n\\nOne of the most serious constitutional responsibilities a President has is nominating someone to serve on the United States Supreme Court. \\n\\nAnd I did that 4 days ago, when I nominated Circuit Court of Appeals Judge Ketanji Brown Jackson. One of our nations top legal minds, who will continue Justice Breyers legacy of excellence.'"
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import SQLiteVSS\n",
"from langchain.document_loaders import TextLoader\n",
"\n",
"# load the document and split it into chunks\n",
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
"documents = loader.load()\n",
"\n",
"# split it into chunks\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"texts = [doc.page_content for doc in docs]\n",
"\n",
"\n",
"# create the open-source embedding function\n",
"embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
"\n",
"\n",
"# load it in sqlite-vss in a table named state_union.\n",
"# the db_file parameter is the name of the file you want\n",
"# as your sqlite database.\n",
"db = SQLiteVSS.from_texts(\n",
" texts=texts,\n",
" embedding=embedding_function,\n",
" table=\"state_union\",\n",
" db_file=\"/tmp/vss.db\"\n",
")\n",
"\n",
"# query it\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"data = db.similarity_search(query)\n",
"\n",
"# print results\n",
"data[0].page_content"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-09-06T14:55:55.370351Z",
"start_time": "2023-09-06T14:55:53.547755Z"
}
}
},
{
"cell_type": "markdown",
"source": [
"### Using existing sqlite connection"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": 7,
"outputs": [
{
"data": {
"text/plain": "'Ketanji Brown Jackson is awesome'"
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings\n",
"from langchain.text_splitter import CharacterTextSplitter\n",
"from langchain.vectorstores import SQLiteVSS\n",
"from langchain.document_loaders import TextLoader\n",
"\n",
"# load the document and split it into chunks\n",
"loader = TextLoader(\"../../../state_of_the_union.txt\")\n",
"documents = loader.load()\n",
"\n",
"# split it into chunks\n",
"text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)\n",
"docs = text_splitter.split_documents(documents)\n",
"texts = [doc.page_content for doc in docs]\n",
"\n",
"\n",
"# create the open-source embedding function\n",
"embedding_function = SentenceTransformerEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n",
"connection = SQLiteVSS.create_connection(db_file=\"/tmp/vss.db\")\n",
"\n",
"db1 = SQLiteVSS(\n",
" table=\"state_union\",\n",
" embedding=embedding_function,\n",
" connection=connection\n",
")\n",
"\n",
"db1.add_texts([\"Ketanji Brown Jackson is awesome\"])\n",
"# query it again\n",
"query = \"What did the president say about Ketanji Brown Jackson\"\n",
"data = db1.similarity_search(query)\n",
"\n",
"# print results\n",
"data[0].page_content"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-09-06T14:59:22.086252Z",
"start_time": "2023-09-06T14:59:21.693237Z"
}
}
},
{
"cell_type": "code",
"execution_count": 13,
"outputs": [],
"source": [
"# Cleaning up\n",
"import os\n",
"os.remove(\"/tmp/vss.db\")"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-09-06T15:01:15.550318Z",
"start_time": "2023-09-06T15:01:15.546428Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

@ -167,7 +167,7 @@
"Tables necessary to determine the places of the planets are not less\r\n",
"necessary than those for the sun, moon, and stars. Some notion of the\r\n",
"number and complexity of these tables may be formed, when we state that\r\n",
"the positions of the two principal planets, (and these the most\r\n",
"the positions of the two principal planets, (and these are the most\r\n",
"necessary for the navigator,) Jupiter and Saturn, require each not less\r\n",
"than one hundred and sixteen tables. Yet it is not only necessary to\r\n",
"predict the position of these bodies, but it is likewise expedient to -> 0.8998482592744614 \n",

@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "ea5c61b2-8b52-4270-bdb0-c4df88608f15",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Interacting with APIs\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "a15e6a18",
"metadata": {},
"source": [
"# Interacting with APIs\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/apis.ipynb)\n",
"\n",
"## Use case \n",
@ -69,9 +78,7 @@
"cell_type": "code",
"execution_count": 2,
"id": "30b780e3",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [
{
"name": "stderr",
@ -415,7 +422,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "22fd28c9-9b48-476c-bca8-20efef7fdb14",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Chatbots\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "ee7f95e4",
"metadata": {},
"source": [
"# Chatbots\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/chatbots.ipynb)\n",
"\n",
"## Use case\n",

@ -1,11 +1,19 @@
{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Code understanding\n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Code Understanding\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/code_understanding.ipynb)\n",
"\n",
"## Use case\n",
@ -1047,7 +1055,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "df29b30a-fd27-4e08-8269-870df5631f9e",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Extraction\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "b84edb4e",
"metadata": {},
"source": [
"# Extraction\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/extraction.ipynb)\n",
"\n",
"## Use case\n",
@ -589,7 +598,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -1,2 +1,2 @@
label: 'More'
position: 1
position: 2

@ -584,7 +584,7 @@
"\n",
"Collectivly, this tells us: carefully inspect Agent traces and tool outputs. \n",
"\n",
"As we saw with the [SQL use case](/docs/use_cases/sql), `ReAct agents` can be work very well for specific problems. \n",
"As we saw with the [SQL use case](/docs/use_cases/qa_structured/sql), `ReAct agents` can be work very well for specific problems. \n",
"\n",
"But, as shown here, the result is degraded relative to what we see with the OpenAI agent."
]

@ -1,7 +1,3 @@
---
sidebar_position: 0
---
# Code writing
:::warning

@ -0,0 +1,307 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "7f0b0c06-ee70-468c-8bf5-b023f9e5e0a2",
"metadata": {},
"source": [
"# Diffbot Graph Transformer\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/more/graph/diffbot_transformer.ipynb)\n",
"\n",
"## Use case\n",
"\n",
"Text data often contain rich relationships and insights that can be useful for various analytics, recommendation engines, or knowledge management applications.\n",
"\n",
"Diffbot's NLP API allows for the extraction of entities, relationships, and semantic meaning from unstructured text data.\n",
"\n",
"By coupling Diffbot's NLP API with Neo4j, a graph database, you can create powerful, dynamic graph structures based on the information extracted from text. These graph structures are fully queryable and can be integrated into various applications.\n",
"\n",
"This combination allows for use cases such as:\n",
"\n",
"* Building knowledge graphs from textual documents, websites, or social media feeds.\n",
"* Generating recommendations based on semantic relationships in the data.\n",
"* Creating advanced search features that understand the relationships between entities.\n",
"* Building analytics dashboards that allow users to explore the hidden relationships in data.\n",
"\n",
"## Overview\n",
"\n",
"LangChain provides tools to interact with Graph Databases:\n",
"\n",
"1. `Construct knowledge graphs from text` using graph transformer and store integrations \n",
"2. `Query a graph database` using chains for query creation and execution\n",
"3. `Interact with a graph database` using agents for robust and flexible querying \n",
"\n",
"## Quickstart\n",
"\n",
"First, get required packages and set environment variables:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "975648da-b24f-4164-a671-6772179e12df",
"metadata": {},
"outputs": [],
"source": [
"!pip install langchain langchain-experimental openai neo4j wikipedia"
]
},
{
"cell_type": "markdown",
"id": "77718977-629e-46c2-b091-f9191b9ec569",
"metadata": {},
"source": [
"## Diffbot NLP Service\n",
"\n",
"Diffbot's NLP service is a tool for extracting entities, relationships, and semantic context from unstructured text data.\n",
"This extracted information can be used to construct a knowledge graph.\n",
"To use their service, you'll need to obtain an API key from [Diffbot](https://www.diffbot.com/products/natural-language/)."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "2cbf97d0-3682-439b-8750-b695ff726789",
"metadata": {},
"outputs": [],
"source": [
"from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer\n",
"\n",
"diffbot_api_key = \"DIFFBOT_API_KEY\"\n",
"diffbot_nlp = DiffbotGraphTransformer(diffbot_api_key=diffbot_api_key)"
]
},
{
"cell_type": "markdown",
"id": "5e3b894a-e3ee-46c7-8116-f8377f8f0159",
"metadata": {},
"source": [
"This code fetches Wikipedia articles about \"Baldur's Gate 3\" and then uses `DiffbotGraphTransformer` to extract entities and relationships.\n",
"The `DiffbotGraphTransformer` outputs a structured data `GraphDocument`, which can be used to populate a graph database.\n",
"Note that text chunking is avoided due to Diffbot's [character limit per API request](https://docs.diffbot.com/reference/introduction-to-natural-language-api)."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "53f8df86-47a1-44a1-9a0f-6725b90703bc",
"metadata": {},
"outputs": [],
"source": [
"from langchain.document_loaders import WikipediaLoader\n",
"\n",
"query = \"Warren Buffett\"\n",
"raw_documents = WikipediaLoader(query=query).load()\n",
"graph_documents = diffbot_nlp.convert_to_graph_documents(raw_documents)"
]
},
{
"cell_type": "markdown",
"id": "31bb851a-aab4-4b97-a6b7-fce397d32b47",
"metadata": {},
"source": [
"## Loading the data into a knowledge graph\n",
"\n",
"You will need to have a running Neo4j instance. One option is to create a [free Neo4j database instance in their Aura cloud service](https://neo4j.com/cloud/platform/aura-graph-database/). You can also run the database locally using the [Neo4j Desktop application](https://neo4j.com/download/), or running a docker container. You can run a local docker container by running the executing the following script:\n",
"```\n",
"docker run \\\n",
" --name neo4j \\\n",
" -p 7474:7474 -p 7687:7687 \\\n",
" -d \\\n",
" -e NEO4J_AUTH=neo4j/pleaseletmein \\\n",
" -e NEO4J_PLUGINS=\\[\\\"apoc\\\"\\] \\\n",
" neo4j:latest\n",
"``` \n",
"If you are using the docker container, you need to wait a couple of second for the database to start."
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "0b2b6641-5a5d-467c-b148-e6aad5e4baa7",
"metadata": {},
"outputs": [],
"source": [
"from langchain.graphs import Neo4jGraph\n",
"\n",
"url=\"bolt://localhost:7687\"\n",
"username=\"neo4j\"\n",
"password=\"pleaseletmein\"\n",
"\n",
"graph = Neo4jGraph(\n",
" url=url,\n",
" username=username, \n",
" password=password\n",
")"
]
},
{
"cell_type": "markdown",
"id": "0b15e840-fe6f-45db-9193-1b4e2df5c12c",
"metadata": {},
"source": [
"The `GraphDocuments` can be loaded into a knowledge graph using the `add_graph_documents` method."
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "1a67c4a8-955c-42a2-9c5d-de3ac0e640ec",
"metadata": {},
"outputs": [],
"source": [
"graph.add_graph_documents(graph_documents)"
]
},
{
"cell_type": "markdown",
"id": "ed411e05-2b03-460d-997e-938482774f40",
"metadata": {},
"source": [
"## Refresh graph schema information\n",
"If the schema of database changes, you can refresh the schema information needed to generate Cypher statements"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "904c9ee3-787c-403f-857d-459ce5ad5a1b",
"metadata": {},
"outputs": [],
"source": [
"graph.refresh_schema()"
]
},
{
"cell_type": "markdown",
"id": "f19d1387-5899-4258-8c94-8ef5fa7db464",
"metadata": {},
"source": [
"## Querying the graph\n",
"We can now use the graph cypher QA chain to ask question of the graph. It is advisable to use **gpt-4** to construct Cypher queries to get the best experience."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9393b732-67c8-45c1-9ec2-089f49c62448",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains import GraphCypherQAChain\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"chain = GraphCypherQAChain.from_llm(\n",
" cypher_llm=ChatOpenAI(temperature=0, model_name=\"gpt-4\"),\n",
" qa_llm=ChatOpenAI(temperature=0, model_name=\"gpt-3.5-turbo\"),\n",
" graph=graph, verbose=True,\n",
" \n",
")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "1a9b3652-b436-404d-aa25-5fb576f23dc0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (p:Person {name: \"Warren Buffett\"})-[:EDUCATED_AT]->(o:Organization)\n",
"RETURN o.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'o.name': 'New York Institute of Finance'}, {'o.name': 'Alice Deal Junior High School'}, {'o.name': 'Woodrow Wilson High School'}, {'o.name': 'University of Nebraska'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"'Warren Buffett attended the University of Nebraska.'"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Which university did Warren Buffett attend?\")"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "adc0ba0f-a62c-4875-89ce-da717f3ab148",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new GraphCypherQAChain chain...\u001b[0m\n",
"Generated Cypher:\n",
"\u001b[32;1m\u001b[1;3mMATCH (p:Person)-[r:EMPLOYEE_OR_MEMBER_OF]->(o:Organization) WHERE o.name = 'Berkshire Hathaway' RETURN p.name\u001b[0m\n",
"Full Context:\n",
"\u001b[32;1m\u001b[1;3m[{'p.name': 'Charlie Munger'}, {'p.name': 'Oliver Chace'}, {'p.name': 'Howard Buffett'}, {'p.name': 'Howard'}, {'p.name': 'Susan Buffett'}, {'p.name': 'Warren Buffett'}]\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"'Charlie Munger, Oliver Chace, Howard Buffett, Susan Buffett, and Warren Buffett are or were working at Berkshire Hathaway.'"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"chain.run(\"Who is or was working at Berkshire Hathaway?\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d636954b-d967-4e96-9489-92e11c74af35",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.4"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,7 +1,3 @@
---
sidebar_position: 0
---
# Self-checking
One of the main issues with using LLMs is that they can often hallucinate and make false claims. One of the surprisingly effective ways to remediate this is to use the LLM itself to check its own answers.

@ -0,0 +1,3 @@
label: 'QA over structured data'
collapsed: false
position: 0.5

@ -0,0 +1,158 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Elasticsearch\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/qa_structured/integrations/elasticsearch.ipynb)\n",
"\n",
"We can use LLMs to interact with Elasticsearch analytics databases in natural language.\n",
"\n",
"This chain builds search queries via the Elasticsearch DSL API (filters and aggregations).\n",
"\n",
"The Elasticsearch client must have permissions for index listing, mapping description and search queries.\n",
"\n",
"See [here](https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html) for instructions on how to run Elasticsearch locally."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"! pip install langchain langchain-experimental openai elasticsearch\n",
"\n",
"# Set env var OPENAI_API_KEY or load from a .env file\n",
"# import dotenv\n",
"\n",
"# dotenv.load_dotenv()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"from elasticsearch import Elasticsearch\n",
"\n",
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains.elasticsearch_database import ElasticsearchDatabaseChain"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Initialize Elasticsearch python client.\n",
"# See https://elasticsearch-py.readthedocs.io/en/v8.8.2/api.html#elasticsearch.Elasticsearch\n",
"ELASTIC_SEARCH_SERVER = \"https://elastic:pass@localhost:9200\"\n",
"db = Elasticsearch(ELASTIC_SEARCH_SERVER)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Uncomment the next cell to initially populate your db."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# customers = [\n",
"# {\"firstname\": \"Jennifer\", \"lastname\": \"Walters\"},\n",
"# {\"firstname\": \"Monica\",\"lastname\":\"Rambeau\"},\n",
"# {\"firstname\": \"Carol\",\"lastname\":\"Danvers\"},\n",
"# {\"firstname\": \"Wanda\",\"lastname\":\"Maximoff\"},\n",
"# {\"firstname\": \"Jennifer\",\"lastname\":\"Takeda\"},\n",
"# ]\n",
"# for i, customer in enumerate(customers):\n",
"# db.create(index=\"customers\", document=customer, id=i)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"llm = ChatOpenAI(model_name=\"gpt-4\", temperature=0)\n",
"chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, verbose=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"question = \"What are the first names of all the customers?\"\n",
"chain.run(question)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can customize the prompt."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from langchain.chains.elasticsearch_database.prompts import DEFAULT_DSL_TEMPLATE\n",
"from langchain.prompts.prompt import PromptTemplate\n",
"\n",
"PROMPT_TEMPLATE = \"\"\"Given an input question, create a syntactically correct Elasticsearch query to run. Unless the user specifies in their question a specific number of examples they wish to obtain, always limit your query to at most {top_k} results. You can order the results by a relevant column to return the most interesting examples in the database.\n",
"\n",
"Unless told to do not query for all the columns from a specific index, only ask for a the few relevant columns given the question.\n",
"\n",
"Pay attention to use only the column names that you can see in the mapping description. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which index. Return the query as valid json.\n",
"\n",
"Use the following format:\n",
"\n",
"Question: Question here\n",
"ESQuery: Elasticsearch Query formatted as json\n",
"\"\"\"\n",
"\n",
"PROMPT = PromptTemplate.from_template(\n",
" PROMPT_TEMPLATE,\n",
")\n",
"chain = ElasticsearchDatabaseChain.from_llm(llm=llm, database=db, query_prompt=PROMPT)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.1"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

@ -0,0 +1,200 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "245065c6",
"metadata": {},
"source": [
"# Vector SQL Retriever with MyScale\n",
"\n",
">[MyScale](https://docs.myscale.com/en/) is an integrated vector database. You can access your database in SQL and also from here, LangChain. MyScale can make a use of [various data types and functions for filters](https://blog.myscale.com/2023/06/06/why-integrated-database-solution-can-boost-your-llm-apps/#filter-on-anything-without-constraints). It will boost up your LLM app no matter if you are scaling up your data or expand your system to broader application."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0246c5bf",
"metadata": {},
"outputs": [],
"source": [
"!pip3 install clickhouse-sqlalchemy InstructorEmbedding sentence_transformers openai langchain-experimental"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7585d2c3",
"metadata": {},
"outputs": [],
"source": [
"\n",
"from os import environ\n",
"import getpass\n",
"from typing import Dict, Any\n",
"from langchain import OpenAI, SQLDatabase, LLMChain\n",
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
"from sqlalchemy import create_engine, Column, MetaData\n",
"from langchain import PromptTemplate\n",
"\n",
"\n",
"from sqlalchemy import create_engine\n",
"\n",
"MYSCALE_HOST = \"msc-1decbcc9.us-east-1.aws.staging.myscale.cloud\"\n",
"MYSCALE_PORT = 443\n",
"MYSCALE_USER = \"chatdata\"\n",
"MYSCALE_PASSWORD = \"myscale_rocks\"\n",
"OPENAI_API_KEY = getpass.getpass(\"OpenAI API Key:\")\n",
"\n",
"engine = create_engine(\n",
" f\"clickhouse://{MYSCALE_USER}:{MYSCALE_PASSWORD}@{MYSCALE_HOST}:{MYSCALE_PORT}/default?protocol=https\"\n",
")\n",
"metadata = MetaData(bind=engine)\n",
"environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e08d9ddc",
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings import HuggingFaceInstructEmbeddings\n",
"from langchain_experimental.sql.vector_sql import VectorSQLOutputParser\n",
"\n",
"output_parser = VectorSQLOutputParser.from_embeddings(\n",
" model=HuggingFaceInstructEmbeddings(\n",
" model_name=\"hkunlp/instructor-xl\", model_kwargs={\"device\": \"cpu\"}\n",
" )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "84b705b2",
"metadata": {},
"outputs": [],
"source": [
"\n",
"from langchain.llms import OpenAI\n",
"from langchain.callbacks import StdOutCallbackHandler\n",
"\n",
"from langchain.utilities.sql_database import SQLDatabase\n",
"from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
"\n",
"chain = VectorSQLDatabaseChain(\n",
" llm_chain=LLMChain(\n",
" llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
" prompt=MYSCALE_PROMPT,\n",
" ),\n",
" top_k=10,\n",
" return_direct=True,\n",
" sql_cmd_parser=output_parser,\n",
" database=SQLDatabase(engine, None, metadata),\n",
")\n",
"\n",
"import pandas as pd\n",
"\n",
"pd.DataFrame(\n",
" chain.run(\n",
" \"Please give me 10 papers to ask what is PageRank?\",\n",
" callbacks=[StdOutCallbackHandler()],\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"id": "6c09cda0",
"metadata": {},
"source": [
"## SQL Database as Retriever"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "734d7ff5",
"metadata": {},
"outputs": [],
"source": [
"from langchain.chat_models import ChatOpenAI\n",
"from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain\n",
"\n",
"from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain\n",
"from langchain_experimental.retrievers.vector_sql_database \\\n",
" import VectorSQLDatabaseChainRetriever\n",
"from langchain_experimental.sql.prompt import MYSCALE_PROMPT\n",
"from langchain_experimental.sql.vector_sql import VectorSQLRetrieveAllOutputParser\n",
"\n",
"output_parser_retrieve_all = VectorSQLRetrieveAllOutputParser.from_embeddings(\n",
" output_parser.model\n",
")\n",
"\n",
"chain = VectorSQLDatabaseChain.from_llm(\n",
" llm=OpenAI(openai_api_key=OPENAI_API_KEY, temperature=0),\n",
" prompt=MYSCALE_PROMPT,\n",
" top_k=10,\n",
" return_direct=True,\n",
" db=SQLDatabase(engine, None, metadata),\n",
" sql_cmd_parser=output_parser_retrieve_all,\n",
" native_format=True,\n",
")\n",
"\n",
"# You need all those keys to get docs\n",
"retriever = VectorSQLDatabaseChainRetriever(sql_db_chain=chain, page_content_key=\"abstract\")\n",
"\n",
"document_with_metadata_prompt = PromptTemplate(\n",
" input_variables=[\"page_content\", \"id\", \"title\", \"authors\", \"pubdate\", \"categories\"],\n",
" template=\"Content:\\n\\tTitle: {title}\\n\\tAbstract: {page_content}\\n\\tAuthors: {authors}\\n\\tDate of Publication: {pubdate}\\n\\tCategories: {categories}\\nSOURCE: {id}\",\n",
")\n",
"\n",
"chain = RetrievalQAWithSourcesChain.from_chain_type(\n",
" ChatOpenAI(\n",
" model_name=\"gpt-3.5-turbo-16k\", openai_api_key=OPENAI_API_KEY, temperature=0.6\n",
" ),\n",
" retriever=retriever,\n",
" chain_type=\"stuff\",\n",
" chain_type_kwargs={\n",
" \"document_prompt\": document_with_metadata_prompt,\n",
" },\n",
" return_source_documents=True,\n",
")\n",
"ans = chain(\"Please give me 10 papers to ask what is PageRank?\",\n",
" callbacks=[StdOutCallbackHandler()])\n",
"print(ans[\"answer\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4948ff25",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -1,12 +1,20 @@
{
"cells": [
{
"cell_type": "raw",
"metadata": {},
"source": [
"---\n",
"title: SQL\n",
"sidebar_position: 2\n",
"---"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# SQL\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/sql.ipynb)\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/qa_structured/sql.ipynb)\n",
"\n",
"## Use case\n",
"\n",
@ -713,6 +721,391 @@
"agent_executor.run(\"Describe the playlisttrack table\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Extending the SQL Toolkit\n",
"\n",
"Although the out-of-the-box SQL Toolkit contains the necessary tools to start working on a database, it is often the case that some extra tools may be useful for extending the agent's capabilities. This is particularly useful when trying to use **domain specific knowledge** in the solution, in order to improve its overall performance.\n",
"\n",
"Some examples include:\n",
"\n",
"- Including dynamic few shot examples\n",
"- Finding misspellings in proper nouns to use as column filters\n",
"\n",
"We can create separate tools which tackle these specific use cases and include them as a complement to the standard SQL Toolkit. Let's see how to include these two custom tools.\n",
"\n",
"#### Including dynamic few-shot examples\n",
"\n",
"In order to include dynamic few-shot examples, we need a custom **Retriever Tool** that handles the vector database in order to retrieve the examples that are semantically similar to the users question.\n",
"\n",
"Let's start by creating a dictionary with some examples: "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# few_shots = {'List all artists.': 'SELECT * FROM artists;',\n",
"# \"Find all albums for the artist 'AC/DC'.\": \"SELECT * FROM albums WHERE ArtistId = (SELECT ArtistId FROM artists WHERE Name = 'AC/DC');\",\n",
"# \"List all tracks in the 'Rock' genre.\": \"SELECT * FROM tracks WHERE GenreId = (SELECT GenreId FROM genres WHERE Name = 'Rock');\",\n",
"# 'Find the total duration of all tracks.': 'SELECT SUM(Milliseconds) FROM tracks;',\n",
"# 'List all customers from Canada.': \"SELECT * FROM customers WHERE Country = 'Canada';\",\n",
"# 'How many tracks are there in the album with ID 5?': 'SELECT COUNT(*) FROM tracks WHERE AlbumId = 5;',\n",
"# 'Find the total number of invoices.': 'SELECT COUNT(*) FROM invoices;',\n",
"# 'List all tracks that are longer than 5 minutes.': 'SELECT * FROM tracks WHERE Milliseconds > 300000;',\n",
"# 'Who are the top 5 customers by total purchase?': 'SELECT CustomerId, SUM(Total) AS TotalPurchase FROM invoices GROUP BY CustomerId ORDER BY TotalPurchase DESC LIMIT 5;',\n",
"# 'Which albums are from the year 2000?': \"SELECT * FROM albums WHERE strftime('%Y', ReleaseDate) = '2000';\",\n",
"# 'How many employees are there': 'SELECT COUNT(*) FROM \"employee\"'\n",
"# }"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can then create a retriever using the list of questions, assigning the target SQL query as metadata:"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"from langchain.schema import Document\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"\n",
"few_shot_docs = [Document(page_content=question, metadata={'sql_query': few_shots[question]}) for question in few_shots.keys()]\n",
"vector_db = FAISS.from_documents(few_shot_docs, embeddings)\n",
"retriever = vector_db.as_retriever()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can create our own custom tool and append it as a new tool in the `create_sql_agent` function:"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents.agent_toolkits import create_retriever_tool\n",
"\n",
"tool_description = \"\"\"\n",
"This tool will help you understand similar examples to adapt them to the user question.\n",
"Input to this tool should be the user question.\n",
"\"\"\"\n",
"\n",
"retriever_tool = create_retriever_tool(\n",
" retriever,\n",
" name='sql_get_similar_examples',\n",
" description=tool_description\n",
" )\n",
"custom_tool_list = [retriever_tool]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can create the agent, adjusting the standard SQL Agent suffix to consider our use case. Although the most straightforward way to handle this would be to include it just in the tool description, this is often not enough and we need to specify it in the agent prompt using the `suffix` argument in the constructor."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents import create_sql_agent, AgentType\n",
"from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n",
"from langchain.utilities import SQLDatabase\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n",
"llm = ChatOpenAI(model_name='gpt-4',temperature=0)\n",
"\n",
"toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n",
"\n",
"custom_suffix = \"\"\"\n",
"I should first get the similar examples I know.\n",
"If the examples are enough to construct the query, I can build it.\n",
"Otherwise, I can then look at the tables in the database to see what I can query.\n",
"Then I should query the schema of the most relevant tables\n",
"\"\"\"\n",
"\n",
"agent = create_sql_agent(llm=llm,\n",
" toolkit=toolkit,\n",
" verbose=True,\n",
" agent_type=AgentType.OPENAI_FUNCTIONS,\n",
" extra_tools=custom_tool_list,\n",
" suffix=custom_suffix\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try it out:"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_get_similar_examples` with `How many employees do we have?`\n",
"\n",
"\n",
"\u001b[0m\u001b[33;1m\u001b[1;3m[Document(page_content='How many employees are there', metadata={'sql_query': 'SELECT COUNT(*) FROM \"employee\"'}), Document(page_content='Find the total number of invoices.', metadata={'sql_query': 'SELECT COUNT(*) FROM invoices;'})]\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM employee`\n",
"responded: {content}\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3mSELECT COUNT(*) FROM employee\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_query` with `SELECT COUNT(*) FROM employee`\n",
"\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3m[(8,)]\u001b[0m\u001b[32;1m\u001b[1;3mWe have 8 employees.\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"'We have 8 employees.'"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.run(\"How many employees do we have?\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see, the agent first used the `sql_get_similar_examples` tool in order to retrieve similar examples. As the question was very similar to other few shot examples, the agent **didn't need to use any other tool** from the standard Toolkit, thus **saving time and tokens**."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"#### Finding and correcting misspellings for proper nouns\n",
"\n",
"In order to filter columns that contain proper nouns such as addresses, song names or artists, we first need to double-check the spelling in order to filter the data correctly. \n",
"\n",
"We can achieve this by creating a vector store using all the distinct proper nouns that exist in the database. We can then have the agent query that vector store each time the user includes a proper noun in their question, to find the correct spelling for that word. In this way, the agent can make sure it understands which entity the user is referring to before building the target query.\n",
"\n",
"Let's follow a similar approach to the few shots, but without metadata: just embedding the proper nouns and then querying to get the most similar one to the misspelled user question.\n",
"\n",
"First we need the unique values for each entity we want, for which we define a function that parses the result into a list of elements:"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import re\n",
"\n",
"def run_query_save_results(db, query):\n",
" res = db.run(query)\n",
" res = [el for sub in ast.literal_eval(res) for el in sub if el]\n",
" res = [re.sub(r'\\b\\d+\\b', '', string).strip() for string in res]\n",
" return res\n",
"\n",
"artists = run_query_save_results(db, \"SELECT Name FROM Artist\")\n",
"albums = run_query_save_results(db, \"SELECT Title FROM Album\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Now we can proceed with creating the custom **retreiver tool** and the final agent:"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents.agent_toolkits import create_retriever_tool\n",
"from langchain.embeddings.openai import OpenAIEmbeddings\n",
"from langchain.vectorstores import FAISS\n",
"\n",
"\n",
"texts = (artists + albums)\n",
"\n",
"embeddings = OpenAIEmbeddings()\n",
"vector_db = FAISS.from_texts(texts, embeddings)\n",
"retriever = vector_db.as_retriever()\n",
"\n",
"retriever_tool = create_retriever_tool(\n",
" retriever,\n",
" name='name_search',\n",
" description='use to learn how a piece of data is actually written, can be from names, surnames addresses etc'\n",
" )\n",
"\n",
"custom_tool_list = [retriever_tool]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents import create_sql_agent, AgentType\n",
"from langchain.agents.agent_toolkits import SQLDatabaseToolkit\n",
"from langchain.utilities import SQLDatabase\n",
"from langchain.chat_models import ChatOpenAI\n",
"\n",
"# db = SQLDatabase.from_uri(\"sqlite:///Chinook.db\")\n",
"llm = ChatOpenAI(model_name='gpt-4', temperature=0)\n",
"\n",
"toolkit = SQLDatabaseToolkit(db=db, llm=llm)\n",
"\n",
"custom_suffix = \"\"\"\n",
"If a user asks for me to filter based on proper nouns, I should first check the spelling using the name_search tool.\n",
"Otherwise, I can then look at the tables in the database to see what I can query.\n",
"Then I should query the schema of the most relevant tables\n",
"\"\"\"\n",
"\n",
"agent = create_sql_agent(llm=llm,\n",
" toolkit=toolkit,\n",
" verbose=True,\n",
" agent_type=AgentType.OPENAI_FUNCTIONS,\n",
" extra_tools=custom_tool_list,\n",
" suffix=custom_suffix\n",
" )"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's try it out:"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m\n",
"Invoking: `name_search` with `alis in pains`\n",
"\n",
"\n",
"\u001b[0m\u001b[33;1m\u001b[1;3m[Document(page_content='House of Pain', metadata={}), Document(page_content='Alice In Chains', metadata={}), Document(page_content='Aisha Duo', metadata={}), Document(page_content='House Of Pain', metadata={})]\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_list_tables` with ``\n",
"responded: {content}\n",
"\n",
"\u001b[0m\u001b[38;5;200m\u001b[1;3mAlbum, Artist, Customer, Employee, Genre, Invoice, InvoiceLine, MediaType, Playlist, PlaylistTrack, Track\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_schema` with `Album, Artist`\n",
"responded: {content}\n",
"\n",
"\u001b[0m\u001b[33;1m\u001b[1;3m\n",
"CREATE TABLE \"Album\" (\n",
"\t\"AlbumId\" INTEGER NOT NULL, \n",
"\t\"Title\" NVARCHAR(160) NOT NULL, \n",
"\t\"ArtistId\" INTEGER NOT NULL, \n",
"\tPRIMARY KEY (\"AlbumId\"), \n",
"\tFOREIGN KEY(\"ArtistId\") REFERENCES \"Artist\" (\"ArtistId\")\n",
")\n",
"\n",
"/*\n",
"3 rows from Album table:\n",
"AlbumId\tTitle\tArtistId\n",
"1\tFor Those About To Rock We Salute You\t1\n",
"2\tBalls to the Wall\t2\n",
"3\tRestless and Wild\t2\n",
"*/\n",
"\n",
"\n",
"CREATE TABLE \"Artist\" (\n",
"\t\"ArtistId\" INTEGER NOT NULL, \n",
"\t\"Name\" NVARCHAR(120), \n",
"\tPRIMARY KEY (\"ArtistId\")\n",
")\n",
"\n",
"/*\n",
"3 rows from Artist table:\n",
"ArtistId\tName\n",
"1\tAC/DC\n",
"2\tAccept\n",
"3\tAerosmith\n",
"*/\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_query_checker` with `SELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'`\n",
"responded: {content}\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3mSELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'\u001b[0m\u001b[32;1m\u001b[1;3m\n",
"Invoking: `sql_db_query` with `SELECT COUNT(*) FROM Album JOIN Artist ON Album.ArtistId = Artist.ArtistId WHERE Artist.Name = 'Alice In Chains'`\n",
"\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3m[(1,)]\u001b[0m\u001b[32;1m\u001b[1;3mAlice In Chains has 1 album in the database.\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
},
{
"data": {
"text/plain": [
"'Alice In Chains has 1 album in the database.'"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"agent.run(\"How many albums does alis in pains have?\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"As we can see, the agent used the `name_search` tool in order to check how to correctly query the database for this specific artist."
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -867,7 +1260,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -42,7 +42,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 1,
"id": "f8cf5765",
"metadata": {},
"outputs": [],
@ -68,7 +68,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 2,
"id": "fdce8923",
"metadata": {},
"outputs": [
@ -83,7 +83,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"objc[31511]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x14f4e8208) and /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x14f5fc208). One of the two will be used. Which one is undefined.\n"
"objc[49534]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x131614208) and /Users/rlm/miniforge3/envs/llama2/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x131988208). One of the two will be used. Which one is undefined.\n"
]
}
],
@ -104,7 +104,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 3,
"id": "b0c55e98",
"metadata": {},
"outputs": [
@ -114,7 +114,7 @@
"4"
]
},
"execution_count": 6,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@ -204,7 +204,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": 4,
"id": "cd7164e3",
"metadata": {},
"outputs": [],
@ -225,7 +225,7 @@
{
"cell_type": "code",
"execution_count": null,
"id": "56158f83-6490-49b8-9f04-2e2e6ec3524b",
"id": "af1176bb-d52a-4cf0-b983-8b7433d45b4f",
"metadata": {},
"outputs": [],
"source": [
@ -459,12 +459,11 @@
{
"cell_type": "code",
"execution_count": null,
"id": "4ae37573-63a7-4564-90e1-196a8ea9b526",
"id": "cc638992-0924-41c0-8dae-8cf683e72b16",
"metadata": {},
"outputs": [],
"source": [
"from langchain import hub\n",
"rag_prompt = hub.pull(\"rlm/rag-prompt-default\")"
"pip install langchainhub"
]
},
{
@ -512,6 +511,9 @@
}
],
"source": [
"# Prompt \n",
"from langchain import hub\n",
"rag_prompt = hub.pull(\"rlm/rag-prompt\")\n",
"from langchain.chains.question_answering import load_qa_chain\n",
"# Chain\n",
"chain = load_qa_chain(llm, chain_type=\"stuff\", prompt=rag_prompt)\n",
@ -529,7 +531,7 @@
},
{
"cell_type": "code",
"execution_count": 31,
"execution_count": 3,
"id": "78f6862d-b7a6-4e03-84e4-45667185bf9b",
"metadata": {},
"outputs": [
@ -539,12 +541,13 @@
"ChatPromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['question', 'context'], output_parser=None, partial_variables={}, template=\"[INST]<<SYS>> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.<</SYS>> \\nQuestion: {question} \\nContext: {context} \\nAnswer: [/INST]\", template_format='f-string', validate_template=True), additional_kwargs={})])"
]
},
"execution_count": 31,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Prompt\n",
"rag_prompt_llama = hub.pull(\"rlm/rag-prompt-llama\")\n",
"rag_prompt_llama"
]

@ -52,7 +52,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "046cefc0",
"metadata": {},
"outputs": [],
@ -269,28 +269,10 @@
},
{
"cell_type": "code",
"execution_count": 9,
"id": "c690f01a",
"execution_count": null,
"id": "9cfe3270-4e89-4c60-a2e5-9026b021bf76",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"INFO:langchain.retrievers.multi_query:Generated queries: ['1. How can Task Decomposition be approached?', '2. What are the different methods for Task Decomposition?', '3. What are the various approaches to decomposing tasks?']\n"
]
},
{
"data": {
"text/plain": [
"4"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import logging\n",
"\n",
@ -318,7 +300,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 9,
"id": "99fa1aec",
"metadata": {},
"outputs": [
@ -326,10 +308,10 @@
"data": {
"text/plain": [
"{'query': 'What are the approaches to Task Decomposition?',\n",
" 'result': 'There are three approaches to task decomposition:\\n\\n1. Using Language Model with simple prompting: This approach involves using a Language Model (LLM) with simple prompts like \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to guide the task decomposition process.\\n\\n2. Using task-specific instructions: In this approach, task-specific instructions are provided to guide the task decomposition. For example, for the task of writing a novel, an instruction like \"Write a story outline\" can be given to help decompose the task into smaller subtasks.\\n\\n3. Human inputs: Task decomposition can also be done with the help of human inputs. This involves getting input and guidance from humans to break down a complex task into smaller, more manageable subtasks.'}"
" 'result': 'The approaches to task decomposition include:\\n\\n1. Simple prompting: This approach involves using simple prompts or questions to guide the agent in breaking down a task into smaller subgoals. For example, the agent can be prompted with \"Steps for XYZ\" or \"What are the subgoals for achieving XYZ?\" to facilitate task decomposition.\\n\\n2. Task-specific instructions: In this approach, task-specific instructions are provided to the agent to guide the decomposition process. For example, if the task is to write a novel, the agent can be instructed to \"Write a story outline\" as a step in the task decomposition.\\n\\n3. Human inputs: This approach involves incorporating human inputs in the task decomposition process. Humans can provide guidance, feedback, and assistance to the agent in breaking down complex tasks into manageable subgoals.\\n\\nThese approaches aim to enable efficient handling of complex tasks by breaking them down into smaller, more manageable subgoals.'}"
]
},
"execution_count": 10,
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@ -355,97 +337,7 @@
"#### Choosing LLMs\n",
"- Browse the > 55 LLM and chat model integrations [here](https://integrations.langchain.com/).\n",
"- See further documentation on LLMs and chat models [here](/docs/modules/model_io/models/).\n",
"- Use local LLMS: The popularity of [PrivateGPT](https://github.com/imartinez/privateGPT) and [GPT4All](https://github.com/nomic-ai/gpt4all) underscore the importance of running LLMs locally.\n",
"Using `GPT4All` is as simple as [downloading the binary]((/docs/integrations/llms/gpt4all)) and then:"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "02d6c9dc",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Found model file at /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"objc[61331]: Class GGMLMetalClass is implemented in both /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libreplit-mainline-metal.dylib (0x2e3384208) and /Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/libllamamodel-mainline-metal.dylib (0x2e37b0208). One of the two will be used. Which one is undefined.\n",
"llama.cpp: using Metal\n",
"llama.cpp: loading model from /Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\n",
"llama_model_load_internal: format = ggjt v3 (latest)\n",
"llama_model_load_internal: n_vocab = 32001\n",
"llama_model_load_internal: n_ctx = 2048\n",
"llama_model_load_internal: n_embd = 5120\n",
"llama_model_load_internal: n_mult = 256\n",
"llama_model_load_internal: n_head = 40\n",
"llama_model_load_internal: n_layer = 40\n",
"llama_model_load_internal: n_rot = 128\n",
"llama_model_load_internal: ftype = 2 (mostly Q4_0)\n",
"llama_model_load_internal: n_ff = 13824\n",
"llama_model_load_internal: n_parts = 1\n",
"llama_model_load_internal: model size = 13B\n",
"llama_model_load_internal: ggml ctx size = 0.09 MB\n",
"llama_model_load_internal: mem required = 9031.71 MB (+ 1608.00 MB per state)\n",
"llama_new_context_with_model: kv self size = 1600.00 MB\n",
"ggml_metal_init: allocating\n",
"ggml_metal_init: using MPS\n",
"ggml_metal_init: loading '/Users/rlm/miniforge3/envs/llama/lib/python3.9/site-packages/gpt4all/llmodel_DO_NOT_MODIFY/build/ggml-metal.metal'\n",
"ggml_metal_init: loaded kernel_add 0x2bbbbc2f0\n",
"ggml_metal_init: loaded kernel_mul 0x2bbbba840\n",
"ggml_metal_init: loaded kernel_mul_row 0x2bb917dd0\n",
"ggml_metal_init: loaded kernel_scale 0x2bb918150\n",
"ggml_metal_init: loaded kernel_silu 0x2bb9184d0\n",
"ggml_metal_init: loaded kernel_relu 0x2bb918850\n",
"ggml_metal_init: loaded kernel_gelu 0x2bbbc3f10\n",
"ggml_metal_init: loaded kernel_soft_max 0x2bbbc5840\n",
"ggml_metal_init: loaded kernel_diag_mask_inf 0x2bbbc4c70\n",
"ggml_metal_init: loaded kernel_get_rows_f16 0x2bbbc5fc0\n",
"ggml_metal_init: loaded kernel_get_rows_q4_0 0x2bbbc6720\n",
"ggml_metal_init: loaded kernel_get_rows_q4_1 0x2bb918c10\n",
"ggml_metal_init: loaded kernel_get_rows_q2_k 0x2bbbc51b0\n",
"ggml_metal_init: loaded kernel_get_rows_q3_k 0x2bbbc7630\n",
"ggml_metal_init: loaded kernel_get_rows_q4_k 0x2d4394e30\n",
"ggml_metal_init: loaded kernel_get_rows_q5_k 0x2bbbc7890\n",
"ggml_metal_init: loaded kernel_get_rows_q6_k 0x2d4395210\n",
"ggml_metal_init: loaded kernel_rms_norm 0x2bbbc8740\n",
"ggml_metal_init: loaded kernel_norm 0x2bbbc8b30\n",
"ggml_metal_init: loaded kernel_mul_mat_f16_f32 0x2d4395470\n",
"ggml_metal_init: loaded kernel_mul_mat_q4_0_f32 0x2d4395a70\n",
"ggml_metal_init: loaded kernel_mul_mat_q4_1_f32 0x1242b1a00\n",
"ggml_metal_init: loaded kernel_mul_mat_q2_k_f32 0x29f17d1c0\n",
"ggml_metal_init: loaded kernel_mul_mat_q3_k_f32 0x2d4396050\n",
"ggml_metal_init: loaded kernel_mul_mat_q4_k_f32 0x2bbbc98a0\n",
"ggml_metal_init: loaded kernel_mul_mat_q5_k_f32 0x2bbbca4a0\n",
"ggml_metal_init: loaded kernel_mul_mat_q6_k_f32 0x2bbbcae90\n",
"ggml_metal_init: loaded kernel_rope 0x2bbbca700\n",
"ggml_metal_init: loaded kernel_alibi_f32 0x2bbbcc6e0\n",
"ggml_metal_init: loaded kernel_cpy_f32_f16 0x2bbbccf90\n",
"ggml_metal_init: loaded kernel_cpy_f32_f32 0x2bbbcd900\n",
"ggml_metal_init: loaded kernel_cpy_f16_f16 0x2bbbce1f0\n",
"ggml_metal_init: recommendedMaxWorkingSetSize = 21845.34 MB\n",
"ggml_metal_init: hasUnifiedMemory = true\n",
"ggml_metal_init: maxTransferRate = built-in GPU\n",
"ggml_metal_add_buffer: allocated 'data ' buffer, size = 6984.06 MB, ( 6984.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'eval ' buffer, size = 1024.00 MB, ( 8008.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'kv ' buffer, size = 1602.00 MB, ( 9610.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'scr0 ' buffer, size = 512.00 MB, (10122.45 / 21845.34)\n",
"ggml_metal_add_buffer: allocated 'scr1 ' buffer, size = 512.00 MB, (10634.45 / 21845.34)\n"
]
}
],
"source": [
"from langchain.llms import GPT4All\n",
"from langchain.chains import RetrievalQA\n",
"\n",
"llm = GPT4All(model=\"/Users/rlm/Desktop/Code/gpt4all/models/nous-hermes-13b.ggmlv3.q4_0.bin\",max_tokens=2048)\n",
"qa_chain = RetrievalQA.from_chain_type(llm, retriever=vectorstore.as_retriever())"
"- See a guide on local LLMS [here](/docs/modules/use_cases/question_answering/how_to/local_retrieval_qa)."
]
},
{
@ -460,24 +352,17 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 10,
"id": "e4fee704",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"ggml_metal_free: deallocating\n"
]
},
{
"data": {
"text/plain": [
"'The approaches to task decomposition include using LLM with simple prompting, task-specific instructions, or human inputs. Thanks for asking!'"
"'The approaches to Task Decomposition are (1) using simple prompting by LLM, (2) using task-specific instructions, and (3) incorporating human inputs. Thanks for asking!'"
]
},
"execution_count": 13,
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@ -507,8 +392,65 @@
},
{
"cell_type": "markdown",
"id": "ff40e8db",
"id": "c825e9bf-6a56-46e4-8bbb-05441f76cb96",
"metadata": {},
"source": [
"We can also store and fetch prompts from the LangChain prompt hub.\n",
"\n",
"This will work with your [LangSmith API key](https://docs.smith.langchain.com/).\n",
"\n",
"For example, see [here](https://smith.langchain.com/hub/rlm/rag-prompt) is a common prompt for RAG.\n",
"\n",
"We can load this."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a896060f-ebc4-4236-a4ad-32960601c6e8",
"metadata": {},
"outputs": [],
"source": [
"pip install langchainhub"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "aef8e734-ba54-48ae-b959-1898618f2d90",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'The approaches to task decomposition include using LLM with simple prompting, task-specific instructions, and human inputs.'"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# RAG prompt\n",
"from langchain import hub\n",
"QA_CHAIN_PROMPT_HUB = hub.pull(\"rlm/rag-prompt\")\n",
"\n",
"qa_chain = RetrievalQA.from_chain_type(\n",
" llm,\n",
" retriever=vectorstore.as_retriever(),\n",
" chain_type_kwargs={\"prompt\": QA_CHAIN_PROMPT_HUB}\n",
")\n",
"result = qa_chain({\"query\": question})\n",
"result[\"result\"]"
]
},
{
"cell_type": "markdown",
"id": "ff40e8db",
"metadata": {
"jp-MarkdownHeadingCollapsed": true
},
"source": [
"#### Return source documents\n",
"\n",

@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "2aca8168-62ec-4bba-93f0-73da08cd1920",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Summarization\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "cf13f702",
"metadata": {},
"source": [
"# Summarization\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/summarization.ipynb)\n",
"\n",
"## Use case\n",
@ -548,7 +557,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "cb6f552e-775f-4d84-bc7c-dca94c06a33c",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Tagging\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "a0507a4b",
"metadata": {},
"source": [
"# Tagging\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/tagging.ipynb)\n",
"\n",
"## Use case\n",
@ -408,7 +417,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -1,12 +1,21 @@
{
"cells": [
{
"cell_type": "raw",
"id": "e254cf03-49fc-4051-a4df-3a8e4e7d2688",
"metadata": {},
"source": [
"---\n",
"sidebar_position: 1\n",
"title: Web scraping\n",
"---"
]
},
{
"cell_type": "markdown",
"id": "6605e7f7",
"metadata": {},
"source": [
"# Web Scraping\n",
"\n",
"[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/web_scraping.ipynb)\n",
"\n",
"## Use case\n",
@ -306,9 +315,7 @@
"cell_type": "code",
"execution_count": 7,
"id": "977560ba",
"metadata": {
"scrolled": false
},
"metadata": {},
"outputs": [
{
"name": "stdout",
@ -591,7 +598,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

@ -5,10 +5,12 @@ pip install openai google-search-results
```
```python
from langchain import LLMMathChain, OpenAI, SerpAPIWrapper, SQLDatabase, SQLDatabaseChain
from langchain.agents import initialize_agent, Tool
from langchain.agents import AgentType
from langchain.agents import initialize_agent, AgentType, Tool
from langchain.chains import LLMMathChain
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.utilities import SerpAPIWrapper, SQLDatabase
from langchain_experimental.sql import SQLDatabaseChain
```

@ -1,4 +1,7 @@
"""Data anonymizer package"""
from langchain_experimental.data_anonymizer.presidio import PresidioAnonymizer
from langchain_experimental.data_anonymizer.presidio import (
PresidioAnonymizer,
PresidioReversibleAnonymizer,
)
__all__ = ["PresidioAnonymizer"]
__all__ = ["PresidioAnonymizer", "PresidioReversibleAnonymizer"]

@ -15,3 +15,17 @@ class AnonymizerBase(ABC):
@abstractmethod
def _anonymize(self, text: str) -> str:
"""Abstract method to anonymize text"""
class ReversibleAnonymizerBase(AnonymizerBase):
"""
Base abstract class for reversible anonymizers.
"""
def deanonymize(self, text: str) -> str:
"""Deanonymize text"""
return self._deanonymize(text)
@abstractmethod
def _deanonymize(self, text: str) -> str:
"""Abstract method to deanonymize text"""

@ -0,0 +1,21 @@
from collections import defaultdict
from dataclasses import dataclass, field
from typing import Dict
MappingDataType = Dict[str, Dict[str, str]]
@dataclass
class DeanonymizerMapping:
mapping: MappingDataType = field(
default_factory=lambda: defaultdict(lambda: defaultdict(str))
)
@property
def data(self) -> MappingDataType:
"""Return the deanonymizer mapping"""
return {k: dict(v) for k, v in self.mapping.items()}
def update(self, new_mapping: MappingDataType) -> None:
for entity_type, values in new_mapping.items():
self.mapping[entity_type].update(values)
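
For orientation, a minimal sketch of how the new `DeanonymizerMapping` dataclass behaves, based only on the definition above (the import path is taken from the `presidio.py` hunk below; the sample entity values are illustrative):

```python
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
    DeanonymizerMapping,
)

mapping = DeanonymizerMapping()

# update() merges per-entity-type dictionaries into the nested defaultdict.
mapping.update({"PERSON": {"Maria Lynch": "John Doe"}})
mapping.update({"PERSON": {"Slim Shady": "Jane Doe"}})

# .data returns a plain-dict copy of the nested mapping.
print(mapping.data)
# {'PERSON': {'Maria Lynch': 'John Doe', 'Slim Shady': 'Jane Doe'}}
```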

@ -0,0 +1,17 @@
from langchain_experimental.data_anonymizer.presidio import MappingDataType
def default_matching_strategy(text: str, deanonymizer_mapping: MappingDataType) -> str:
"""
Default matching strategy for deanonymization.
It replaces all the anonymized entities with the original ones.
Args:
text: text to deanonymize
deanonymizer_mapping: mapping between anonymized entities and original ones"""
# Iterate over all the entities (PERSON, EMAIL_ADDRESS, etc.)
for entity_type in deanonymizer_mapping:
for anonymized, original in deanonymizer_mapping[entity_type].items():
text = text.replace(anonymized, original)
return text
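
A quick sketch of the strategy in isolation (import path taken from the `presidio.py` hunk below; the mapping values are illustrative):

```python
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
    default_matching_strategy,
)

mapping = {"PERSON": {"Maria Lynch": "John Doe"}}

# Every anonymized substring is replaced by its original value.
print(default_matching_strategy("Hi, I am Maria Lynch.", mapping))
# Hi, I am John Doe.
```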

@ -1,8 +1,8 @@
import string
from typing import Callable, Dict
from typing import Callable, Dict, Optional
def get_pseudoanonymizer_mapping() -> Dict[str, Callable]:
def get_pseudoanonymizer_mapping(seed: Optional[int] = None) -> Dict[str, Callable]:
try:
from faker import Faker
except ImportError as e:
@ -11,6 +11,7 @@ def get_pseudoanonymizer_mapping() -> Dict[str, Callable]:
) from e
fake = Faker()
fake.seed_instance(seed)
# Listed entities supported by Microsoft Presidio (for now, global and US only)
# Source: https://microsoft.github.io/presidio/supported_entities/
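
The new `seed` parameter makes the generated fake values reproducible. A small sketch, assuming `faker` is installed and using `"PERSON"` as one of the Presidio entity keys referenced in the comment above:

```python
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
    get_pseudoanonymizer_mapping,
)

# Two mappings built with the same seed generate identical fake values.
first = get_pseudoanonymizer_mapping(seed=42)
second = get_pseudoanonymizer_mapping(seed=42)

# Presidio's custom operator calls each mapping function with the matched
# text, so we pass a dummy match here.
assert first["PERSON"]("John Doe") == second["PERSON"]("John Doe")
# With seed=None (the default), the values stay random.
```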

@ -1,24 +1,56 @@
from __future__ import annotations
from typing import TYPE_CHECKING, Dict, List, Optional
import json
from collections import defaultdict
from pathlib import Path
from typing import TYPE_CHECKING, Callable, Dict, List, Optional, Union
from langchain_experimental.data_anonymizer.base import AnonymizerBase
import yaml
from langchain_experimental.data_anonymizer.base import (
AnonymizerBase,
ReversibleAnonymizerBase,
)
from langchain_experimental.data_anonymizer.deanonymizer_mapping import (
DeanonymizerMapping,
MappingDataType,
)
from langchain_experimental.data_anonymizer.deanonymizer_matching_strategies import (
default_matching_strategy,
)
from langchain_experimental.data_anonymizer.faker_presidio_mapping import (
get_pseudoanonymizer_mapping,
)
if TYPE_CHECKING:
from presidio_analyzer import EntityRecognizer
try:
from presidio_analyzer import AnalyzerEngine
except ImportError as e:
raise ImportError(
"Could not import presidio_analyzer, please install with "
"`pip install presidio-analyzer`. You will also need to download a "
"spaCy model to use the analyzer, e.g. "
"`python -m spacy download en_core_web_lg`."
) from e
try:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
except ImportError as e:
raise ImportError(
"Could not import presidio_anonymizer, please install with "
"`pip install presidio-anonymizer`."
) from e
if TYPE_CHECKING:
from presidio_analyzer import EntityRecognizer, RecognizerResult
from presidio_anonymizer.entities import EngineResult
class PresidioAnonymizer(AnonymizerBase):
"""Anonymizer using Microsoft Presidio."""
class PresidioAnonymizerBase(AnonymizerBase):
def __init__(
self,
analyzed_fields: Optional[List[str]] = None,
operators: Optional[Dict[str, OperatorConfig]] = None,
faker_seed: Optional[int] = None,
):
"""
Args:
@ -28,25 +60,10 @@ class PresidioAnonymizer(AnonymizerBase):
Operators allow for custom anonymization of detected PII.
Learn more:
https://microsoft.github.io/presidio/tutorial/10_simple_anonymization/
faker_seed: Seed used to initialize faker.
Defaults to None, in which case faker will be seeded randomly
and provide random values.
"""
try:
from presidio_analyzer import AnalyzerEngine
except ImportError as e:
raise ImportError(
"Could not import presidio_analyzer, please install with "
"`pip install presidio-analyzer`. You will also need to download a "
"spaCy model to use the analyzer, e.g. "
"`python -m spacy download en_core_web_lg`."
) from e
try:
from presidio_anonymizer import AnonymizerEngine
from presidio_anonymizer.entities import OperatorConfig
except ImportError as e:
raise ImportError(
"Could not import presidio_anonymizer, please install with "
"`pip install presidio-anonymizer`."
) from e
self.analyzed_fields = (
analyzed_fields
if analyzed_fields is not None
@ -59,13 +76,41 @@ class PresidioAnonymizer(AnonymizerBase):
field: OperatorConfig(
operator_name="custom", params={"lambda": faker_function}
)
for field, faker_function in get_pseudoanonymizer_mapping().items()
for field, faker_function in get_pseudoanonymizer_mapping(
faker_seed
).items()
}
)
self._analyzer = AnalyzerEngine()
self._anonymizer = AnonymizerEngine()
def add_recognizer(self, recognizer: EntityRecognizer) -> None:
"""Add a recognizer to the analyzer
Args:
recognizer: Recognizer to add to the analyzer.
"""
self._analyzer.registry.add_recognizer(recognizer)
self.analyzed_fields.extend(recognizer.supported_entities)
def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
"""Add operators to the anonymizer
Args:
operators: Operators to add to the anonymizer.
"""
self.operators.update(operators)
class PresidioAnonymizer(PresidioAnonymizerBase):
def _anonymize(self, text: str) -> str:
"""Anonymize text.
Each PII entity is replaced with a fake value.
Fake values are generated randomly, so they differ on every call.
Args:
text: text to anonymize
"""
results = self._analyzer.analyze(
text,
entities=self.analyzed_fields,
@ -78,11 +123,185 @@ class PresidioAnonymizer(AnonymizerBase):
operators=self.operators,
).text
def add_recognizer(self, recognizer: EntityRecognizer) -> None:
"""Add a recognizer to the analyzer"""
self._analyzer.registry.add_recognizer(recognizer)
self.analyzed_fields.extend(recognizer.supported_entities)
def add_operators(self, operators: Dict[str, OperatorConfig]) -> None:
"""Add operators to the anonymizer"""
self.operators.update(operators)
class PresidioReversibleAnonymizer(PresidioAnonymizerBase, ReversibleAnonymizerBase):
def __init__(
self,
analyzed_fields: Optional[List[str]] = None,
operators: Optional[Dict[str, OperatorConfig]] = None,
faker_seed: Optional[int] = None,
):
super().__init__(analyzed_fields, operators, faker_seed)
self._deanonymizer_mapping = DeanonymizerMapping()
@property
def deanonymizer_mapping(self) -> MappingDataType:
"""Return the deanonymizer mapping"""
return self._deanonymizer_mapping.data
def _update_deanonymizer_mapping(
self,
original_text: str,
analyzer_results: List[RecognizerResult],
anonymizer_results: EngineResult,
) -> None:
"""Creates or updates the mapping used to de-anonymize text.
This method uses the results returned by the
analysis and anonymization processes.
It constructs a mapping from each anonymized entity
back to its original text value.
The mapping is stored in the "deanonymizer_mapping" property.
Example of "deanonymizer_mapping":
{
"PERSON": {
"<anonymized>": "<original>",
"John Doe": "Slim Shady"
},
"PHONE_NUMBER": {
"111-111-1111": "555-555-5555"
}
...
}
"""
# We are able to zip and loop through both lists because we expect
# them to contain corresponding entities for each identified piece
# of analyzable data from our input.
# We sort them by their 'start' attribute because it allows us to
# match corresponding entities by their position in the input text.
analyzer_results = sorted(analyzer_results, key=lambda d: d.start)
anonymizer_results.items = sorted(
anonymizer_results.items, key=lambda d: d.start
)
new_deanonymizer_mapping: MappingDataType = defaultdict(dict)
for analyzed_entity, anonymized_entity in zip(
analyzer_results, anonymizer_results.items
):
original_value = original_text[analyzed_entity.start : analyzed_entity.end]
new_deanonymizer_mapping[anonymized_entity.entity_type][
anonymized_entity.text
] = original_value
self._deanonymizer_mapping.update(new_deanonymizer_mapping)
def _anonymize(self, text: str) -> str:
"""Anonymize text.
Each PII entity is replaced with a fake value.
Fake values are generated randomly, so they differ on every call.
At the same time, we will create a mapping from each anonymized entity
back to its original text value.
Args:
text: text to anonymize
"""
analyzer_results = self._analyzer.analyze(
text,
entities=self.analyzed_fields,
language="en",
)
filtered_analyzer_results = (
self._anonymizer._remove_conflicts_and_get_text_manipulation_data(
analyzer_results
)
)
anonymizer_results = self._anonymizer.anonymize(
text,
analyzer_results=analyzer_results,
operators=self.operators,
)
self._update_deanonymizer_mapping(
text, filtered_analyzer_results, anonymizer_results
)
return anonymizer_results.text
def _deanonymize(
self,
text_to_deanonymize: str,
deanonymizer_matching_strategy: Callable[
[str, MappingDataType], str
] = default_matching_strategy,
) -> str:
"""Deanonymize text.
Each anonymized entity is replaced with its original value.
This method uses the mapping created during the anonymization process.
Args:
text_to_deanonymize: text to deanonymize
deanonymizer_matching_strategy: function to use to match
anonymized entities with their original values and replace them.
"""
if not self._deanonymizer_mapping:
raise ValueError(
"Deanonymizer mapping is empty.",
"Please call anonymize() and anonymize some text first.",
)
text_to_deanonymize = deanonymizer_matching_strategy(
text_to_deanonymize, self.deanonymizer_mapping
)
return text_to_deanonymize
def save_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
"""Save the deanonymizer mapping to a JSON or YAML file.
Args:
file_path: Path to file to save the mapping to.
Example:
.. code-block:: python
anonymizer.save_deanonymizer_mapping(file_path="path/mapping.json")
"""
save_path = Path(file_path)
if save_path.suffix not in [".json", ".yaml"]:
raise ValueError(f"{save_path} must have an extension of .json or .yaml")
# Make sure parent directories exist
save_path.parent.mkdir(parents=True, exist_ok=True)
if save_path.suffix == ".json":
with open(save_path, "w") as f:
json.dump(self.deanonymizer_mapping, f, indent=2)
elif save_path.suffix == ".yaml":
with open(save_path, "w") as f:
yaml.dump(self.deanonymizer_mapping, f, default_flow_style=False)
def load_deanonymizer_mapping(self, file_path: Union[Path, str]) -> None:
"""Load the deanonymizer mapping from a JSON or YAML file.
Args:
file_path: Path to file to load the mapping from.
Example:
.. code-block:: python
anonymizer.load_deanonymizer_mapping(file_path="path/mapping.json")
"""
load_path = Path(file_path)
if load_path.suffix not in [".json", ".yaml"]:
raise ValueError(f"{load_path} must have an extension of .json or .yaml")
if load_path.suffix == ".json":
with open(load_path, "r") as f:
loaded_mapping = json.load(f)
elif load_path.suffix == ".yaml":
with open(load_path, "r") as f:
loaded_mapping = yaml.load(f, Loader=yaml.FullLoader)
self._deanonymizer_mapping.update(loaded_mapping)
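
Putting the new pieces together, a usage sketch of the reversible anonymizer (requires `presidio-analyzer`, `presidio-anonymizer`, `faker`, and a spaCy model, as the import errors above spell out; the exact fake name generated will vary with the seed):

```python
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer

anonymizer = PresidioReversibleAnonymizer(
    analyzed_fields=["PERSON"],
    faker_seed=42,  # reproducible fake values
)

anonymized = anonymizer.anonymize("Hello, my name is John Doe.")

# anonymize() populated the mapping from fake values back to originals.
print(anonymizer.deanonymizer_mapping)  # {'PERSON': {'<fake name>': 'John Doe'}}

# Round-trip back to the original text.
assert anonymizer.deanonymize(anonymized) == "Hello, my name is John Doe."

# The mapping can be persisted as JSON or YAML and loaded elsewhere.
anonymizer.save_deanonymizer_mapping("mapping.json")
anonymizer.load_deanonymizer_mapping("mapping.json")
```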

@ -0,0 +1,5 @@
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer
__all__ = [
"DiffbotGraphTransformer",
]

@ -0,0 +1,316 @@
from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
import requests
from langchain.graphs.graph_document import GraphDocument, Node, Relationship
from langchain.schema import Document
from langchain.utils import get_from_env
def format_property_key(s: str) -> str:
words = s.split()
if not words:
return s
first_word = words[0].lower()
capitalized_words = [word.capitalize() for word in words[1:]]
return "".join([first_word] + capitalized_words)
class NodesList:
"""
Manages a list of nodes with associated properties.
Attributes:
nodes (Dict[Tuple, Any]): Stores nodes as keys and their properties as values.
Each key is a tuple where the first element is the
node ID and the second is the node type.
"""
def __init__(self) -> None:
self.nodes: Dict[Tuple[Union[str, int], str], Any] = dict()
def add_node_property(
self, node: Tuple[Union[str, int], str], properties: Dict[str, Any]
) -> None:
"""
Adds or updates node properties.
If the node does not exist in the list, it's added along with its properties.
If the node already exists, its properties are updated with the new values.
Args:
node (Tuple): A tuple containing the node ID and node type.
properties (Dict): A dictionary of properties to add or update for the node.
"""
if node not in self.nodes:
self.nodes[node] = properties
else:
self.nodes[node].update(properties)
def return_node_list(self) -> List[Node]:
"""
Returns the nodes as a list of Node objects.
Each Node object will have its ID, type, and properties populated.
Returns:
List[Node]: A list of Node objects.
"""
nodes = [
Node(id=key[0], type=key[1], properties=self.nodes[key])
for key in self.nodes
]
return nodes
# Properties that should be treated as node properties instead of relationships
FACT_TO_PROPERTY_TYPE = [
"Date",
"Number",
"Job title",
"Cause of death",
"Organization type",
"Academic title",
]
schema_mapping = [
("HEADQUARTERS", "ORGANIZATION_LOCATIONS"),
("RESIDENCE", "PERSON_LOCATION"),
("ALL_PERSON_LOCATIONS", "PERSON_LOCATION"),
("CHILD", "HAS_CHILD"),
("PARENT", "HAS_PARENT"),
("CUSTOMERS", "HAS_CUSTOMER"),
("SKILLED_AT", "INTERESTED_IN"),
]
class SimplifiedSchema:
"""
Provides functionality for working with a simplified schema mapping.
Attributes:
schema (Dict): A dictionary containing the mapping to simplified schema types.
"""
def __init__(self) -> None:
"""Initializes the schema dictionary based on the predefined list."""
self.schema = dict()
for row in schema_mapping:
self.schema[row[0]] = row[1]
def get_type(self, type: str) -> str:
"""
Retrieves the simplified schema type for a given original type.
Args:
type (str): The original schema type to find the simplified type for.
Returns:
str: The simplified schema type if it exists;
otherwise, returns the original type.
"""
try:
return self.schema[type]
except KeyError:
return type
class DiffbotGraphTransformer:
"""Transforms documents into graph documents using Diffbot's NLP API.
A graph document transformation system takes a sequence of Documents and returns a
sequence of Graph Documents.
Example:
.. code-block:: python
class DiffbotGraphTransformer(BaseGraphDocumentTransformer):
def transform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[GraphDocument]:
results = []
for document in documents:
raw_results = self.nlp_request(document.page_content)
graph_document = self.process_response(raw_results, document)
results.append(graph_document)
return results
async def atransform_documents(
self, documents: Sequence[Document], **kwargs: Any
) -> Sequence[Document]:
raise NotImplementedError
"""
def __init__(
self,
diffbot_api_key: Optional[str] = None,
fact_confidence_threshold: float = 0.7,
include_qualifiers: bool = True,
include_evidence: bool = True,
simplified_schema: bool = True,
) -> None:
"""
Initialize the graph transformer with various options.
Args:
diffbot_api_key (str):
The API key for Diffbot's NLP services.
fact_confidence_threshold (float):
Minimum confidence level for facts to be included.
include_qualifiers (bool):
Whether to include qualifiers in the relationships.
include_evidence (bool):
Whether to include evidence for the relationships.
simplified_schema (bool):
Whether to use a simplified schema for relationships.
"""
self.diffbot_api_key = diffbot_api_key or get_from_env(
"diffbot_api_key", "DIFFBOT_API_KEY"
)
self.fact_threshold_confidence = fact_confidence_threshold
self.include_qualifiers = include_qualifiers
self.include_evidence = include_evidence
self.simplified_schema = None
if simplified_schema:
self.simplified_schema = SimplifiedSchema()
def nlp_request(self, text: str) -> Dict[str, Any]:
"""
Make an API request to the Diffbot NLP endpoint.
Args:
text (str): The text to be processed.
Returns:
Dict[str, Any]: The JSON response from the API.
"""
# Relationship extraction only works for English
payload = {
"content": text,
"lang": "en",
}
FIELDS = "facts"
HOST = "nl.diffbot.com"
url = (
f"https://{HOST}/v1/?fields={FIELDS}&"
f"token={self.diffbot_api_key}&language=en"
)
result = requests.post(url, data=payload)
return result.json()
def process_response(
self, payload: Dict[str, Any], document: Document
) -> GraphDocument:
"""
Transform the Diffbot NLP response into a GraphDocument.
Args:
payload (Dict[str, Any]): The JSON response from Diffbot's NLP API.
document (Document): The original document.
Returns:
GraphDocument: The transformed document as a graph.
"""
# Return empty result if there are no facts
if "facts" not in payload or not payload["facts"]:
return GraphDocument(nodes=[], relationships=[], source=document)
# Nodes are a custom class because we need to deduplicate
nodes_list = NodesList()
# Relationships are a plain list because we don't deduplicate them
relationships = list()
for record in payload["facts"]:
# Skip if the fact is below the threshold confidence
if record["confidence"] < self.fact_threshold_confidence:
continue
# TODO: It should probably be treated as a node property
if not record["value"]["allTypes"]:
continue
# Define source node
source_id = (
record["entity"]["allUris"][0]
if record["entity"]["allUris"]
else record["entity"]["name"]
)
source_label = record["entity"]["allTypes"][0]["name"].capitalize()
source_name = record["entity"]["name"]
source_node = Node(id=source_id, type=source_label)
nodes_list.add_node_property(
(source_id, source_label), {"name": source_name}
)
# Define target node
target_id = (
record["value"]["allUris"][0]
if record["value"]["allUris"]
else record["value"]["name"]
)
target_label = record["value"]["allTypes"][0]["name"].capitalize()
target_name = record["value"]["name"]
# Some facts are better suited as node properties
if target_label in FACT_TO_PROPERTY_TYPE:
nodes_list.add_node_property(
(source_id, source_label),
{format_property_key(record["property"]["name"]): target_name},
)
else: # Define relationship
# Define target node object
target_node = Node(id=target_id, type=target_label)
nodes_list.add_node_property(
(target_id, target_label), {"name": target_name}
)
# Define relationship type
rel_type = record["property"]["name"].replace(" ", "_").upper()
if self.simplified_schema:
rel_type = self.simplified_schema.get_type(rel_type)
# Relationship qualifiers/properties
rel_properties = dict()
relationship_evidence = [el["passage"] for el in record["evidence"]][0]
if self.include_evidence:
rel_properties.update({"evidence": relationship_evidence})
if self.include_qualifiers and record.get("qualifiers"):
for property in record["qualifiers"]:
prop_key = format_property_key(property["property"]["name"])
rel_properties[prop_key] = property["value"]["name"]
relationship = Relationship(
source=source_node,
target=target_node,
type=rel_type,
properties=rel_properties,
)
relationships.append(relationship)
return GraphDocument(
nodes=nodes_list.return_node_list(),
relationships=relationships,
source=document,
)
def convert_to_graph_documents(
self, documents: Sequence[Document]
) -> List[GraphDocument]:
"""Convert a sequence of documents into graph documents.
Args:
documents (Sequence[Document]): The original documents.
Returns:
Sequence[GraphDocument]: The transformed documents as graphs.
"""
results = []
for document in documents:
raw_results = self.nlp_request(document.page_content)
graph_document = self.process_response(raw_results, document)
results.append(graph_document)
return results
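
A usage sketch of the transformer; the API key falls back to the `DIFFBOT_API_KEY` environment variable via `get_from_env` above, and the sample sentence is illustrative:

```python
from langchain.schema import Document
from langchain_experimental.graph_transformers.diffbot import DiffbotGraphTransformer

transformer = DiffbotGraphTransformer(fact_confidence_threshold=0.8)

docs = [Document(page_content="Marie Curie worked at the University of Paris.")]

# Each document becomes a GraphDocument with deduplicated nodes.
for graph_doc in transformer.convert_to_graph_documents(docs):
    print(graph_doc.nodes)
    print(graph_doc.relationships)
```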

@ -0,0 +1,38 @@
"""Vector SQL Database Chain Retriever"""
from typing import Any, Dict, List
from langchain.callbacks.manager import (
AsyncCallbackManagerForRetrieverRun,
CallbackManagerForRetrieverRun,
)
from langchain.schema import BaseRetriever, Document
from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain
class VectorSQLDatabaseChainRetriever(BaseRetriever):
"""Retriever that uses SQLDatabase as Retriever"""
sql_db_chain: VectorSQLDatabaseChain
"""SQL Database Chain"""
page_content_key: str = "content"
"""column name for page content of documents"""
def _get_relevant_documents(
self,
query: str,
*,
run_manager: CallbackManagerForRetrieverRun,
**kwargs: Any,
) -> List[Document]:
ret: List[Dict[str, Any]] = self.sql_db_chain(
query, callbacks=run_manager.get_child(), **kwargs
)["result"]
return [
Document(page_content=r[self.page_content_key], metadata=r) for r in ret
]
async def _aget_relevant_documents(
self, query: str, *, run_manager: AsyncCallbackManagerForRetrieverRun
) -> List[Document]:
raise NotImplementedError
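
A wiring sketch for the new retriever. The hunk header does not show the new file's path, so the first import below is an assumption; the chain argument stands for a pre-built `VectorSQLDatabaseChain` (see the `vector_sql.py` hunk further down):

```python
# NOTE: module path assumed; this hunk does not show the new file's location.
from langchain_experimental.retrievers.vector_sql_database import (
    VectorSQLDatabaseChainRetriever,
)
from langchain_experimental.sql.vector_sql import VectorSQLDatabaseChain


def build_retriever(chain: VectorSQLDatabaseChain) -> VectorSQLDatabaseChainRetriever:
    # Wrap a configured chain; `page_content_key` names the result column
    # that holds the document text.
    return VectorSQLDatabaseChainRetriever(
        sql_db_chain=chain, page_content_key="content"
    )
```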

@ -0,0 +1,85 @@
# flake8: noqa
from langchain.prompts.prompt import PromptTemplate
PROMPT_SUFFIX = """Only use the following tables:
{table_info}
Question: {input}"""
_VECTOR_SQL_DEFAULT_TEMPLATE = """You are a {dialect} expert. Given an input question, first create a syntactically correct {dialect} query to run, then look at the results of the query and return the answer to the input question.
{dialect} queries have a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance.
When the query is asking for {top_k} closest rows, you have to use this distance function to calculate distance to the entity's array on the vector column and order by the distance to retrieve relevant rows.
*NOTICE*: `DISTANCE(column, array)` only accepts an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user-defined function called `NeuralArray(entity)` to retrieve the entity's array.
Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per {dialect}. You should only order according to the distance function.
Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
Use the following format:
Question: "Question here"
SQLQuery: "SQL Query to run"
SQLResult: "Result of the SQLQuery"
Answer: "Final answer here"
"""
VECTOR_SQL_PROMPT = PromptTemplate(
input_variables=["input", "table_info", "dialect", "top_k"],
template=_VECTOR_SQL_DEFAULT_TEMPLATE + PROMPT_SUFFIX,
)
_myscale_prompt = """You are a MyScale expert. Given an input question, first create a syntactically correct MyScale query to run, then look at the results of the query and return the answer to the input question.
MyScale queries have a vector distance function called `DISTANCE(column, array)` to compute relevance to the user's question and sort the feature array column by the relevance.
When the query is asking for {top_k} closest rows, you have to use this distance function to calculate distance to the entity's array on the vector column and order by the distance to retrieve relevant rows.
*NOTICE*: `DISTANCE(column, array)` only accepts an array column as its first argument and a `NeuralArray(entity)` as its second argument. You also need a user-defined function called `NeuralArray(entity)` to retrieve the entity's array.
Unless the user specifies in the question a specific number of examples to obtain, query for at most {top_k} results using the LIMIT clause as per MyScale. You should only order according to the distance function.
Never query for all columns from a table. You must query only the columns that are needed to answer the question. Wrap each column name in double quotes (") to denote them as delimited identifiers.
Pay attention to use only the column names you can see in the tables below. Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
Pay attention to use today() function to get the current date, if the question involves "today". `ORDER BY` clause should always be after `WHERE` clause. DO NOT add semicolon to the end of SQL. Pay attention to the comment in table schema.
Use the following format:
======== table info ========
<some table infos>
Question: "Question here"
SQLQuery: "SQL Query to run"
Here are some examples:
======== table info ========
CREATE TABLE "ChatPaper" (
abstract String,
id String,
vector Array(Float32),
) ENGINE = ReplicatedReplacingMergeTree()
ORDER BY id
PRIMARY KEY id
Question: What is Feature Pyramid Network?
SQLQuery: SELECT ChatPaper.title, ChatPaper.id, ChatPaper.authors FROM ChatPaper ORDER BY DISTANCE(vector, NeuralArray(PaperRank contribution)) LIMIT {top_k}
Let's begin:
======== table info ========
{table_info}
Question: {input}
SQLQuery: """
MYSCALE_PROMPT = PromptTemplate(
input_variables=["input", "table_info", "top_k"],
template=_myscale_prompt + PROMPT_SUFFIX,
)
VECTOR_SQL_PROMPTS = {
"myscale": MYSCALE_PROMPT,
}
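
For reference, a sketch of rendering the new prompt; `format()` fills the four declared input variables (the import path is assumed since this hunk does not name the file, and the table DDL is illustrative):

```python
# NOTE: import path assumed; this hunk does not show the new file's location.
from langchain_experimental.sql.prompt import VECTOR_SQL_PROMPT

formatted = VECTOR_SQL_PROMPT.format(
    input="What are the three papers closest to 'feature pyramids'?",
    table_info='CREATE TABLE "ChatPaper" (abstract String, id String, vector Array(Float32))',
    dialect="MyScale",
    top_k=3,
)
print(formatted)
```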

@ -0,0 +1,237 @@
"""Vector SQL Database Chain Retriever"""
from __future__ import annotations
from typing import Any, Dict, List, Optional, Union
from langchain.callbacks.manager import CallbackManagerForChainRun
from langchain.chains.llm import LLMChain
from langchain.chains.sql_database.prompt import PROMPT, SQL_PROMPTS
from langchain.embeddings.base import Embeddings
from langchain.prompts.prompt import PromptTemplate
from langchain.schema import BaseOutputParser, BasePromptTemplate
from langchain.schema.language_model import BaseLanguageModel
from langchain.tools.sql_database.prompt import QUERY_CHECKER
from langchain.utilities.sql_database import SQLDatabase
from langchain_experimental.sql.base import INTERMEDIATE_STEPS_KEY, SQLDatabaseChain
class VectorSQLOutputParser(BaseOutputParser[str]):
"""Output Parser for Vector SQL
1. finds `NeuralArray()` and replaces it with the embedding
2. finds `DISTANCE()` and replaces it with the distance function name of the backend SQL
"""
model: Embeddings
"""Embedding model to extract embedding for entity"""
distance_func_name: str = "distance"
"""Distance name for Vector SQL"""
class Config:
arbitrary_types_allowed = True
@property
def _type(self) -> str:
return "vector_sql_parser"
@classmethod
def from_embeddings(
cls, model: Embeddings, distance_func_name: str = "distance", **kwargs: Any
) -> BaseOutputParser:
return cls(model=model, distance_func_name=distance_func_name, **kwargs)
def parse(self, text: str) -> str:
text = text.strip()
start = text.find("NeuralArray(")
_sql_str_compl = text
if start >= 0:  # find() returns -1 when absent; position 0 is a valid match
_matched = text[text.find("NeuralArray(") + len("NeuralArray(") :]
end = _matched.find(")") + start + len("NeuralArray(") + 1
entity = _matched[: _matched.find(")")]
vecs = self.model.embed_query(entity)
vecs_str = "[" + ",".join(map(str, vecs)) + "]"
_sql_str_compl = text.replace("DISTANCE", self.distance_func_name).replace(
text[start:end], vecs_str
)
if _sql_str_compl[-1] == ";":
_sql_str_compl = _sql_str_compl[:-1]
return _sql_str_compl
class VectorSQLRetrieveAllOutputParser(VectorSQLOutputParser):
"""Based on VectorSQLOutputParser
It also modifies the SQL to retrieve all columns
"""
@property
def _type(self) -> str:
return "vector_sql_retrieve_all_parser"
def parse(self, text: str) -> str:
text = text.strip()
start = text.upper().find("SELECT")
if start >= 0:
end = text.upper().find("FROM")
text = text.replace(text[start + len("SELECT") + 1 : end - 1], "*")
return super().parse(text)
def _try_eval(x: Any) -> Any:
try:
return eval(x)
except Exception:
return x
def get_result_from_sqldb(
db: SQLDatabase, cmd: str
) -> Union[str, List[Dict[str, Any]], Dict[str, Any]]:
result = db._execute(cmd, fetch="all") # type: ignore
if isinstance(result, list):
return [{k: _try_eval(v) for k, v in dict(d._asdict()).items()} for d in result]
else:
return {
k: _try_eval(v) for k, v in dict(result._asdict()).items() # type: ignore
}
class VectorSQLDatabaseChain(SQLDatabaseChain):
"""Chain for interacting with Vector SQL Database.
Example:
.. code-block:: python
from langchain_experimental.sql import SQLDatabaseChain
from langchain import OpenAI, SQLDatabase, OpenAIEmbeddings
db = SQLDatabase(...)
db_chain = VectorSQLDatabaseChain.from_llm(OpenAI(), db, OpenAIEmbeddings())
*Security note*: Make sure that the database connection uses credentials
that are narrowly-scoped to only include the permissions this chain needs.
Failure to do so may result in data corruption or loss, since this chain may
attempt commands like `DROP TABLE` or `INSERT` if appropriately prompted.
The best way to guard against such negative outcomes is to (as appropriate)
limit the permissions granted to the credentials used with this chain.
This issue shows an example negative outcome if these steps are not taken:
https://github.com/langchain-ai/langchain/issues/5923
"""
sql_cmd_parser: VectorSQLOutputParser
"""Parser for Vector SQL"""
native_format: bool = False
"""If return_direct, controls whether to return in python native format"""
def _call(
self,
inputs: Dict[str, Any],
run_manager: Optional[CallbackManagerForChainRun] = None,
) -> Dict[str, Any]:
_run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
input_text = f"{inputs[self.input_key]}\nSQLQuery:"
_run_manager.on_text(input_text, verbose=self.verbose)
# If not present, then defaults to None which is all tables.
table_names_to_use = inputs.get("table_names_to_use")
table_info = self.database.get_table_info(table_names=table_names_to_use)
llm_inputs = {
"input": input_text,
"top_k": str(self.top_k),
"dialect": self.database.dialect,
"table_info": table_info,
"stop": ["\nSQLResult:"],
}
intermediate_steps: List = []
try:
intermediate_steps.append(llm_inputs) # input: sql generation
llm_out = self.llm_chain.predict(
callbacks=_run_manager.get_child(),
**llm_inputs,
)
sql_cmd = self.sql_cmd_parser.parse(llm_out)
if self.return_sql:
return {self.output_key: sql_cmd}
if not self.use_query_checker:
_run_manager.on_text(llm_out, color="green", verbose=self.verbose)
intermediate_steps.append(
llm_out
) # output: sql generation (no checker)
intermediate_steps.append({"sql_cmd": llm_out}) # input: sql exec
result = get_result_from_sqldb(self.database, sql_cmd)
intermediate_steps.append(str(result)) # output: sql exec
else:
query_checker_prompt = self.query_checker_prompt or PromptTemplate(
template=QUERY_CHECKER, input_variables=["query", "dialect"]
)
query_checker_chain = LLMChain(
llm=self.llm_chain.llm,
prompt=query_checker_prompt,
output_parser=self.llm_chain.output_parser,
)
query_checker_inputs = {
"query": llm_out,
"dialect": self.database.dialect,
}
checked_llm_out = query_checker_chain.predict(
callbacks=_run_manager.get_child(), **query_checker_inputs
)
checked_sql_command = self.sql_cmd_parser.parse(checked_llm_out)
intermediate_steps.append(
checked_llm_out
) # output: sql generation (checker)
_run_manager.on_text(
checked_llm_out, color="green", verbose=self.verbose
)
intermediate_steps.append(
{"sql_cmd": checked_llm_out}
) # input: sql exec
result = get_result_from_sqldb(self.database, checked_sql_command)
intermediate_steps.append(str(result)) # output: sql exec
llm_out = checked_llm_out
sql_cmd = checked_sql_command
_run_manager.on_text("\nSQLResult: ", verbose=self.verbose)
_run_manager.on_text(str(result), color="yellow", verbose=self.verbose)
# If return direct, we just set the final result equal to
# the result of the sql query result, otherwise try to get a human readable
# final answer
if self.return_direct:
final_result = result
else:
_run_manager.on_text("\nAnswer:", verbose=self.verbose)
input_text += f"{llm_out}\nSQLResult: {result}\nAnswer:"
llm_inputs["input"] = input_text
intermediate_steps.append(llm_inputs) # input: final answer
final_result = self.llm_chain.predict(
callbacks=_run_manager.get_child(),
**llm_inputs,
).strip()
intermediate_steps.append(final_result) # output: final answer
_run_manager.on_text(final_result, color="green", verbose=self.verbose)
chain_result: Dict[str, Any] = {self.output_key: final_result}
if self.return_intermediate_steps:
chain_result[INTERMEDIATE_STEPS_KEY] = intermediate_steps
return chain_result
except Exception as exc:
# Append intermediate steps to exception, to aid in logging and later
# improvement of few shot prompt seeds
exc.intermediate_steps = intermediate_steps # type: ignore
raise exc
@property
def _chain_type(self) -> str:
return "vector_sql_database_chain"
@classmethod
def from_llm(
cls,
llm: BaseLanguageModel,
db: SQLDatabase,
prompt: Optional[BasePromptTemplate] = None,
sql_cmd_parser: Optional[VectorSQLOutputParser] = None,
**kwargs: Any,
) -> VectorSQLDatabaseChain:
assert sql_cmd_parser, "`sql_cmd_parser` must be set in VectorSQLDatabaseChain."
prompt = prompt or SQL_PROMPTS.get(db.dialect, PROMPT)
llm_chain = LLMChain(llm=llm, prompt=prompt)
return cls(
llm_chain=llm_chain, database=db, sql_cmd_parser=sql_cmd_parser, **kwargs
)
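
A construction sketch for the new chain, using only names shown in this hunk plus standard langchain imports (the ClickHouse URI is illustrative):

```python
from langchain.chat_models import ChatOpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.utilities import SQLDatabase

from langchain_experimental.sql.vector_sql import (
    VectorSQLDatabaseChain,
    VectorSQLOutputParser,
)

db = SQLDatabase.from_uri("clickhouse://localhost:8123/default")  # illustrative URI

# The parser embeds NeuralArray(...) entities at query time.
parser = VectorSQLOutputParser.from_embeddings(model=OpenAIEmbeddings())

chain = VectorSQLDatabaseChain.from_llm(
    llm=ChatOpenAI(temperature=0),
    db=db,
    sql_cmd_parser=parser,  # required: from_llm asserts it is set
)
chain.run("How many rows does the table have?")
```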

@ -1245,6 +1245,7 @@ optional = false
python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*"
files = [
{file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"},
{file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"},
]
[[package]]
@ -3752,6 +3753,31 @@ files = [
{file = "types_PyYAML-6.0.12.11-py3-none-any.whl", hash = "sha256:a461508f3096d1d5810ec5ab95d7eeecb651f3a15b71959999988942063bf01d"},
]
[[package]]
name = "types-requests"
version = "2.31.0.2"
description = "Typing stubs for requests"
optional = false
python-versions = "*"
files = [
{file = "types-requests-2.31.0.2.tar.gz", hash = "sha256:6aa3f7faf0ea52d728bb18c0a0d1522d9bfd8c72d26ff6f61bfc3d06a411cf40"},
{file = "types_requests-2.31.0.2-py3-none-any.whl", hash = "sha256:56d181c85b5925cbc59f4489a57e72a8b2166f18273fd8ba7b6fe0c0b986f12a"},
]
[package.dependencies]
types-urllib3 = "*"
[[package]]
name = "types-urllib3"
version = "1.26.25.14"
description = "Typing stubs for urllib3"
optional = false
python-versions = "*"
files = [
{file = "types-urllib3-1.26.25.14.tar.gz", hash = "sha256:229b7f577c951b8c1b92c1bc2b2fdb0b49847bd2af6d1cc2a2e3dd340f3bda8f"},
{file = "types_urllib3-1.26.25.14-py3-none-any.whl", hash = "sha256:9683bbb7fb72e32bfe9d2be6e04875fbe1b3eeec3cbb4ea231435aa7fd6b4f0e"},
]
[[package]]
name = "typing-extensions"
version = "4.7.1"
@ -3995,4 +4021,4 @@ extended-testing = ["faker", "presidio-analyzer", "presidio-anonymizer"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "66ac482bd05eb74414210ac28fc1e8dae1a9928a4a1314e1326fada3551aa8ad"
content-hash = "443e88f690572715cf58671e4480a006574c7141a1258dff0a0818b954184901"

@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain-experimental"
version = "0.0.13"
version = "0.0.15"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"
@ -23,6 +23,7 @@ black = "^23.1.0"
[tool.poetry.group.typing.dependencies]
mypy = "^0.991"
types-pyyaml = "^6.0.12.2"
types-requests = "^2.28.11.5"
[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"

@ -0,0 +1,154 @@
import os
from typing import Iterator, List
import pytest
@pytest.fixture(scope="module", autouse=True)
def check_spacy_model() -> Iterator[None]:
import spacy
if not spacy.util.is_package("en_core_web_lg"):
pytest.skip(reason="Spacy model 'en_core_web_lg' not installed")
yield
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
@pytest.mark.parametrize(
"analyzed_fields,should_contain",
[(["PERSON"], False), (["PHONE_NUMBER"], True), (None, False)],
)
def test_anonymize(analyzed_fields: List[str], should_contain: bool) -> None:
"""Test anonymizing a name in a simple sentence"""
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
text = "Hello, my name is John Doe."
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=analyzed_fields)
anonymized_text = anonymizer.anonymize(text)
assert ("John Doe" in anonymized_text) == should_contain
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_multiple() -> None:
"""Test anonymizing multiple items in a sentence"""
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
text = "John Smith's phone number is 313-666-7440 and email is johnsmith@gmail.com"
anonymizer = PresidioReversibleAnonymizer()
anonymized_text = anonymizer.anonymize(text)
for phrase in ["John Smith", "313-666-7440", "johnsmith@gmail.com"]:
assert phrase not in anonymized_text
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_anonymize_with_custom_operator() -> None:
"""Test anonymize a name with a custom operator"""
from presidio_anonymizer.entities import OperatorConfig
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
custom_operator = {"PERSON": OperatorConfig("replace", {"new_value": "<name>"})}
anonymizer = PresidioReversibleAnonymizer(operators=custom_operator)
text = "Jane Doe was here."
anonymized_text = anonymizer.anonymize(text)
assert anonymized_text == "<name> was here."
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_add_recognizer_operator() -> None:
"""
Test add recognizer and anonymize a new type of entity and with a custom operator
"""
from presidio_analyzer import PatternRecognizer
from presidio_anonymizer.entities import OperatorConfig
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=[])
titles_list = ["Sir", "Madam", "Professor"]
custom_recognizer = PatternRecognizer(
supported_entity="TITLE", deny_list=titles_list
)
anonymizer.add_recognizer(custom_recognizer)
# anonymizing with custom recognizer
text = "Madam Jane Doe was here."
anonymized_text = anonymizer.anonymize(text)
assert anonymized_text == "<TITLE> Jane Doe was here."
# anonymizing with custom recognizer and operator
custom_operator = {"TITLE": OperatorConfig("replace", {"new_value": "Dear"})}
anonymizer.add_operators(custom_operator)
anonymized_text = anonymizer.anonymize(text)
assert anonymized_text == "Dear Jane Doe was here."
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_deanonymizer_mapping() -> None:
"""Test if deanonymizer mapping is correctly populated"""
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
anonymizer = PresidioReversibleAnonymizer(
analyzed_fields=["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"]
)
anonymizer.anonymize("Hello, my name is John Doe and my number is 444 555 6666.")
# ["PERSON", "PHONE_NUMBER"]
assert len(anonymizer.deanonymizer_mapping.keys()) == 2
assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values()
assert (
"444 555 6666"
in anonymizer.deanonymizer_mapping.get("PHONE_NUMBER", {}).values()
)
text_to_anonymize = (
"And my name is Jane Doe, my email is jane@gmail.com and "
"my credit card is 4929 5319 6292 5362."
)
anonymizer.anonymize(text_to_anonymize)
# ["PERSON", "PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD"]
assert len(anonymizer.deanonymizer_mapping.keys()) == 4
assert "Jane Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values()
assert (
"jane@gmail.com"
in anonymizer.deanonymizer_mapping.get("EMAIL_ADDRESS", {}).values()
)
assert (
"4929 5319 6292 5362"
in anonymizer.deanonymizer_mapping.get("CREDIT_CARD", {}).values()
)
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_deanonymize() -> None:
"""Test deanonymizing a name in a simple sentence"""
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
text = "Hello, my name is John Doe."
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])
anonymized_text = anonymizer.anonymize(text)
deanonymized_text = anonymizer.deanonymize(anonymized_text)
assert deanonymized_text == text
@pytest.mark.requires("presidio_analyzer", "presidio_anonymizer", "faker")
def test_save_load_deanonymizer_mapping() -> None:
from langchain_experimental.data_anonymizer import PresidioReversibleAnonymizer
anonymizer = PresidioReversibleAnonymizer(analyzed_fields=["PERSON"])
anonymizer.anonymize("Hello, my name is John Doe.")
try:
anonymizer.save_deanonymizer_mapping("test_file.json")
assert os.path.isfile("test_file.json")
anonymizer = PresidioReversibleAnonymizer()
anonymizer.load_deanonymizer_mapping("test_file.json")
assert "John Doe" in anonymizer.deanonymizer_mapping.get("PERSON", {}).values()
finally:
os.remove("test_file.json")

@ -1,5 +1,5 @@
"""SQL agent."""
from typing import Any, Dict, List, Optional
from typing import Any, Dict, List, Optional, Sequence
from langchain.agents.agent import AgentExecutor, BaseSingleActionAgent
from langchain.agents.agent_toolkits.sql.prompt import (
@ -21,6 +21,7 @@ from langchain.prompts.chat import (
)
from langchain.schema.language_model import BaseLanguageModel
from langchain.schema.messages import AIMessage, SystemMessage
from langchain.tools import BaseTool
def create_sql_agent(
@ -38,10 +39,11 @@ def create_sql_agent(
early_stopping_method: str = "force",
verbose: bool = False,
agent_executor_kwargs: Optional[Dict[str, Any]] = None,
extra_tools: Sequence[BaseTool] = (),
**kwargs: Dict[str, Any],
) -> AgentExecutor:
"""Construct an SQL agent from an LLM and tools."""
tools = toolkit.get_tools()
tools = toolkit.get_tools() + list(extra_tools)
prefix = prefix.format(dialect=toolkit.dialect, top_k=top_k)
agent: BaseSingleActionAgent
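
A sketch of the new `extra_tools` parameter in use; the helper tool below is hypothetical and `db` is any existing `SQLDatabase`:

```python
from langchain.agents import create_sql_agent
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.chat_models import ChatOpenAI
from langchain.tools import Tool
from langchain.utilities import SQLDatabase


def build_agent(db: SQLDatabase):
    llm = ChatOpenAI(temperature=0)
    toolkit = SQLDatabaseToolkit(db=db, llm=llm)
    # New in this change: extra_tools are appended to the toolkit's tools.
    return create_sql_agent(
        llm=llm,
        toolkit=toolkit,
        extra_tools=[
            Tool(
                name="schema_notes",
                func=lambda q: "The `users` table is keyed by email.",
                description="Free-form notes about the database schema.",
            )
        ],
    )
```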

@ -84,17 +84,17 @@ class GraphSparqlQAChain(Chain):
_intent = self.sparql_intent_chain.run({"prompt": prompt}, callbacks=callbacks)
intent = _intent.strip()
if "SELECT" not in intent and "UPDATE" not in intent:
raise ValueError(
"I am sorry, but this prompt seems to fit none of the currently "
"supported SPARQL query types, i.e., SELECT and UPDATE."
)
elif intent.find("SELECT") < intent.find("UPDATE"):
if "SELECT" in intent and "UPDATE" not in intent:
sparql_generation_chain = self.sparql_generation_select_chain
intent = "SELECT"
else:
elif "UPDATE" in intent and "SELECT" not in intent:
sparql_generation_chain = self.sparql_generation_update_chain
intent = "UPDATE"
else:
raise ValueError(
"I am sorry, but this prompt seems to fit none of the currently "
"supported SPARQL query types, i.e., SELECT and UPDATE."
)
_run_manager.on_text("Identified intent:", end="\n", verbose=self.verbose)
_run_manager.on_text(intent, color="green", end="\n", verbose=self.verbose)

@ -1,6 +1,7 @@
import json
import logging
import os
import tempfile
import zipfile
from pathlib import Path
from typing import Iterator, List, Union
@ -136,7 +137,8 @@ class TelegramChatLoader(chat_loaders.BaseChatLoader):
with zipfile.ZipFile(path) as zip_file:
for file in zip_file.namelist():
if file.endswith((".html", ".json")):
yield zip_file.extract(file)
with tempfile.TemporaryDirectory() as temp_dir:
yield zip_file.extract(file, path=temp_dir)
def lazy_load(self) -> Iterator[chat_loaders.ChatSession]:
"""Lazy load the messages from the chat file and yield them

@ -1,7 +1,8 @@
import asyncio
import logging
import warnings
from typing import Any, Dict, Iterator, List, Optional, Union
from concurrent.futures import ThreadPoolExecutor
from typing import Any, Dict, Iterator, List, Optional, Union, cast
import aiohttp
import requests
@ -129,9 +130,18 @@ class AsyncHtmlLoader(BaseLoader):
def load(self) -> List[Document]:
"""Load text from the url(s) in web_path."""
results = asyncio.run(self.fetch_all(self.web_paths))
try:
# Raises RuntimeError if there is no current event loop.
asyncio.get_running_loop()
# If there is a current event loop, we need to run the async code
# in a separate loop, in a separate thread.
with ThreadPoolExecutor(max_workers=1) as executor:
future = executor.submit(asyncio.run, self.fetch_all(self.web_paths))
results = future.result()
except RuntimeError:
results = asyncio.run(self.fetch_all(self.web_paths))
docs = []
for i, text in enumerate(results):
for i, text in enumerate(cast(List[str], results)):
metadata = {"source": self.web_paths[i]}
docs.append(Document(page_content=text, metadata=metadata))
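
A short sketch of what this change enables, assuming an environment where an event loop is already running:

```python
from langchain.document_loaders import AsyncHtmlLoader

loader = AsyncHtmlLoader(["https://example.com"])

# With this change, load() also works inside an already-running event
# loop (e.g. a Jupyter notebook): the coroutine runs on a worker thread.
docs = loader.load()
print(docs[0].metadata["source"])
```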

@ -1,11 +1,16 @@
"""Module contains common parsers for PDFs."""
from typing import Any, Iterator, Mapping, Optional, Sequence, Union
from __future__ import annotations
from typing import TYPE_CHECKING, Any, Iterator, Mapping, Optional, Sequence, Union
from urllib.parse import urlparse
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders import Blob
from langchain.schema import Document
if TYPE_CHECKING:
import pdfplumber.page
class PyPDFParser(BaseBlobParser):
"""Load `PDF` using `pypdf` and chunk at character level."""
@ -116,13 +121,17 @@ class PyPDFium2Parser(BaseBlobParser):
class PDFPlumberParser(BaseBlobParser):
"""Parse `PDF` with `PDFPlumber`."""
def __init__(self, text_kwargs: Optional[Mapping[str, Any]] = None) -> None:
def __init__(
self, text_kwargs: Optional[Mapping[str, Any]] = None, dedupe: bool = False
) -> None:
"""Initialize the parser.
Args:
text_kwargs: Keyword arguments to pass to ``pdfplumber.Page.extract_text()``
dedupe: Avoid duplicate-character artifacts in the extracted text if `dedupe=True`.
"""
self.text_kwargs = text_kwargs or {}
self.dedupe = dedupe
def lazy_parse(self, blob: Blob) -> Iterator[Document]:
"""Lazily parse the blob."""
@ -133,7 +142,7 @@ class PDFPlumberParser(BaseBlobParser):
yield from [
Document(
page_content=page.extract_text(**self.text_kwargs),
page_content=self._process_page_content(page),
metadata=dict(
{
"source": blob.source,
@ -151,6 +160,12 @@ class PDFPlumberParser(BaseBlobParser):
for page in doc.pages
]
def _process_page_content(self, page: pdfplumber.page.Page) -> str:
"""Process the page content based on dedupe."""
if self.dedupe:
return page.dedupe_chars().extract_text(**self.text_kwargs)
return page.extract_text(**self.text_kwargs)
class AmazonTextractPDFParser(BaseBlobParser):
"""Send `PDF` files to `Amazon Textract` and parse them.

@ -437,7 +437,10 @@ class PDFPlumberLoader(BasePDFLoader):
"""Load `PDF` files using `pdfplumber`."""
def __init__(
self, file_path: str, text_kwargs: Optional[Mapping[str, Any]] = None
self,
file_path: str,
text_kwargs: Optional[Mapping[str, Any]] = None,
dedupe: bool = False,
) -> None:
"""Initialize with a file path."""
try:
@ -450,11 +453,12 @@ class PDFPlumberLoader(BasePDFLoader):
super().__init__(file_path)
self.text_kwargs = text_kwargs or {}
self.dedupe = dedupe
def load(self) -> List[Document]:
"""Load file."""
parser = PDFPlumberParser(text_kwargs=self.text_kwargs)
parser = PDFPlumberParser(text_kwargs=self.text_kwargs, dedupe=self.dedupe)
blob = Blob.from_path(self.file_path)
return parser.parse(blob)
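
A usage sketch of the new `dedupe` flag (the file path is illustrative):

```python
from langchain.document_loaders import PDFPlumberLoader

# dedupe=True routes extraction through pdfplumber's dedupe_chars(),
# avoiding the duplicated-character artifacts some PDFs produce.
loader = PDFPlumberLoader("example.pdf", dedupe=True)
pages = loader.load()
```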

@ -114,7 +114,7 @@ class S3DirectoryLoader(BaseLoader):
aws_access_key_id=self.aws_access_key_id,
aws_secret_access_key=self.aws_secret_access_key,
aws_session_token=self.aws_session_token,
boto_config=self.boto_config,
config=self.boto_config,
)
bucket = s3.Bucket(self.bucket)
docs = []

@ -8,7 +8,9 @@ from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
if TYPE_CHECKING:
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
from playwright.async_api import Browser as AsyncBrowser
from playwright.async_api import Page as AsyncPage
from playwright.async_api import Response as AsyncResponse
from playwright.sync_api import Browser, Page, Response
@ -155,6 +157,9 @@ class PlaywrightURLLoader(BaseLoader):
try:
page = browser.new_page()
response = page.goto(url)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
text = self.evaluator.evaluate(page, browser, response)
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))
@ -185,6 +190,9 @@ class PlaywrightURLLoader(BaseLoader):
try:
page = await browser.new_page()
response = await page.goto(url)
if response is None:
raise ValueError(f"page.goto() returned None for url {url}")
text = await self.evaluator.evaluate_async(page, browser, response)
metadata = {"source": url}
docs.append(Document(page_content=text, metadata=metadata))

@ -35,6 +35,7 @@ from langchain.embeddings.gpt4all import GPT4AllEmbeddings
from langchain.embeddings.huggingface import (
HuggingFaceBgeEmbeddings,
HuggingFaceEmbeddings,
HuggingFaceInferenceAPIEmbeddings,
HuggingFaceInstructEmbeddings,
)
from langchain.embeddings.huggingface_hub import HuggingFaceHubEmbeddings
@ -69,6 +70,7 @@ __all__ = [
"CohereEmbeddings",
"ElasticsearchEmbeddings",
"HuggingFaceEmbeddings",
"HuggingFaceInferenceAPIEmbeddings",
"JinaEmbeddings",
"LlamaCppEmbeddings",
"HuggingFaceHubEmbeddings",

@ -1,5 +1,7 @@
from typing import Any, Dict, List, Optional
import requests
from langchain.embeddings.base import Embeddings
from langchain.pydantic_v1 import BaseModel, Extra, Field
@ -58,7 +60,7 @@ class HuggingFaceEmbeddings(BaseModel, Embeddings):
except ImportError as exc:
raise ImportError(
"Could not import sentence_transformers python package. "
"Please install it with `pip install sentence_transformers`."
"Please install it with `pip install sentence-transformers`."
) from exc
self.client = sentence_transformers.SentenceTransformer(
@ -266,3 +268,71 @@ class HuggingFaceBgeEmbeddings(BaseModel, Embeddings):
self.query_instruction + text, **self.encode_kwargs
)
return embedding.tolist()
class HuggingFaceInferenceAPIEmbeddings(BaseModel, Embeddings):
"""Embed texts using the HuggingFace API.
Requires a HuggingFace Inference API key and a model name.
"""
api_key: str
"""Your API key for the HuggingFace Inference API."""
model_name: str = "sentence-transformers/all-MiniLM-L6-v2"
"""The name of the model to use for text embeddings."""
@property
def _api_url(self) -> str:
return (
"https://api-inference.huggingface.co"
"/pipeline"
"/feature-extraction"
f"/{self.model_name}"
)
@property
def _headers(self) -> dict:
return {"Authorization": f"Bearer {self.api_key}"}
def embed_documents(self, texts: List[str]) -> List[List[float]]:
"""Get the embeddings for a list of texts.
Args:
texts (Documents): A list of texts to get embeddings for.
Returns:
Embedded texts as List[List[float]], where each inner List[float]
corresponds to a single input text.
Example:
.. code-block:: python
from langchain.embeddings import HuggingFaceInferenceAPIEmbeddings
hf_embeddings = HuggingFaceInferenceAPIEmbeddings(
api_key="your_api_key",
model_name="sentence-transformers/all-MiniLM-l6-v2"
)
texts = ["Hello, world!", "How are you?"]
hf_embeddings.embed_documents(texts)
"""
response = requests.post(
self._api_url,
headers=self._headers,
json={
"inputs": texts,
"options": {"wait_for_model": True, "use_cache": True},
},
)
return response.json()
def embed_query(self, text: str) -> List[float]:
"""Compute query embeddings using a HuggingFace transformer model.
Args:
text: The text to embed.
Returns:
Embeddings for the text.
"""
return self.embed_documents([text])[0]

@ -87,8 +87,8 @@ def _async_retry_decorator(embeddings: OpenAIEmbeddings) -> Any:
# https://stackoverflow.com/questions/76469415/getting-embeddings-of-length-1-from-langchain-openaiembeddings
def _check_response(response: dict) -> dict:
if any(len(d["embedding"]) == 1 for d in response["data"]):
def _check_response(response: dict, skip_empty: bool = False) -> dict:
if any(len(d["embedding"]) == 1 for d in response["data"]) and not skip_empty:
import openai
raise openai.error.APIError("OpenAI API returned an empty embedding")
@ -102,7 +102,7 @@ def embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) -> Any:
@retry_decorator
def _embed_with_retry(**kwargs: Any) -> Any:
response = embeddings.client.create(**kwargs)
return _check_response(response)
return _check_response(response, skip_empty=embeddings.skip_empty)
return _embed_with_retry(**kwargs)
@ -113,7 +113,7 @@ async def async_embed_with_retry(embeddings: OpenAIEmbeddings, **kwargs: Any) ->
@_async_retry_decorator(embeddings)
async def _async_embed_with_retry(**kwargs: Any) -> Any:
response = await embeddings.client.acreate(**kwargs)
return _check_response(response)
return _check_response(response, skip_empty=embeddings.skip_empty)
return await _async_embed_with_retry(**kwargs)
@ -196,6 +196,9 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
"""Whether to show a progress bar when embedding."""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Holds any model parameters valid for `create` call not explicitly specified."""
skip_empty: bool = False
"""Whether to skip empty strings when embedding or raise an error.
Defaults to not skipping."""
class Config:
"""Configuration for this pydantic object."""
@ -371,6 +374,8 @@ class OpenAIEmbeddings(BaseModel, Embeddings):
results: List[List[List[float]]] = [[] for _ in range(len(texts))]
num_tokens_in_batch: List[List[int]] = [[] for _ in range(len(texts))]
for i in range(len(indices)):
if self.skip_empty and len(batched_embeddings[i]) == 1:
continue
results[indices[i]].append(batched_embeddings[i])
num_tokens_in_batch[indices[i]].append(len(tokens[i]))
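
A minimal sketch of the new flag; per the diff above, it suppresses the length-1 "empty embedding" error rather than raising mid-batch:

```python
from langchain.embeddings import OpenAIEmbeddings

# skip_empty=True silently drops length-1 "empty" embeddings instead of
# raising, so a single bad input no longer aborts a large batch.
embeddings = OpenAIEmbeddings(skip_empty=True)
vectors = embeddings.embed_documents(["some text", "another text"])
```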

@ -0,0 +1,51 @@
from __future__ import annotations
from typing import List, Union
from langchain.load.serializable import Serializable
from langchain.pydantic_v1 import Field
from langchain.schema import Document
class Node(Serializable):
"""Represents a node in a graph with associated properties.
Attributes:
id (Union[str, int]): A unique identifier for the node.
type (str): The type or label of the node, default is "Node".
properties (dict): Additional properties and metadata associated with the node.
"""
id: Union[str, int]
type: str = "Node"
properties: dict = Field(default_factory=dict)
class Relationship(Serializable):
"""Represents a directed relationship between two nodes in a graph.
Attributes:
source (Node): The source node of the relationship.
target (Node): The target node of the relationship.
type (str): The type of the relationship.
properties (dict): Additional properties associated with the relationship.
"""
source: Node
target: Node
type: str
properties: dict = Field(default_factory=dict)
class GraphDocument(Serializable):
"""Represents a graph document consisting of nodes and relationships.
Attributes:
nodes (List[Node]): A list of nodes in the graph.
relationships (List[Relationship]): A list of relationships in the graph.
source (Document): The document from which the graph information is derived.
"""
nodes: List[Node]
relationships: List[Relationship]
source: Document
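
A construction sketch using only the classes defined above (the IDs and sentence are illustrative):

```python
from langchain.graphs.graph_document import GraphDocument, Node, Relationship
from langchain.schema import Document

marie = Node(id="Q7186", type="Person", properties={"name": "Marie Curie"})
paris = Node(id="Q90", type="Location", properties={"name": "Paris"})

lived_in = Relationship(source=marie, target=paris, type="PERSON_LOCATION")

graph_doc = GraphDocument(
    nodes=[marie, paris],
    relationships=[lived_in],
    source=Document(page_content="Marie Curie lived in Paris."),
)
```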

@ -1,5 +1,7 @@
from typing import Any, Dict, List
from langchain.graphs.graph_document import GraphDocument
node_properties_query = """
CALL apoc.meta.data()
YIELD label, other, elementType, type, property
@ -99,3 +101,56 @@ class Neo4jGraph:
The relationships are the following:
{[el['output'] for el in relationships]}
"""
def add_graph_documents(
self, graph_documents: List[GraphDocument], include_source: bool = False
) -> None:
"""
Take GraphDocument as input and use it to construct a graph.
"""
for document in graph_documents:
include_docs_query = (
"CREATE (d:Document) "
"SET d.text = $document.page_content "
"SET d += $document.metadata "
"WITH d "
)
# Import nodes
self.query(
(
f"{include_docs_query if include_source else ''}"
"UNWIND $data AS row "
"CALL apoc.merge.node([row.type], {id: row.id}, "
"row.properties, {}) YIELD node "
f"{'MERGE (d)-[:MENTIONS]->(node) ' if include_source else ''}"
"RETURN distinct 'done' AS result"
),
{
"data": [el.__dict__ for el in document.nodes],
"document": document.source.__dict__,
},
)
# Import relationships
self.query(
"UNWIND $data AS row "
"CALL apoc.merge.node([row.source_label], {id: row.source},"
"{}, {}) YIELD node as source "
"CALL apoc.merge.node([row.target_label], {id: row.target},"
"{}, {}) YIELD node as target "
"CALL apoc.merge.relationship(source, row.type, "
"{}, row.properties, target) YIELD rel "
"RETURN distinct 'done'",
{
"data": [
{
"source": el.source.id,
"source_label": el.source.type,
"target": el.target.id,
"target_label": el.target.type,
"type": el.type.replace(" ", "_").upper(),
"properties": el.properties,
}
for el in document.relationships
]
},
)
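A hedged usage sketch (the connection details are placeholders; a running Neo4j instance with the APOC plugin is assumed, and graph_doc is the GraphDocument built in the sketch above):

from langchain.graphs import Neo4jGraph

graph = Neo4jGraph(url="bolt://localhost:7687", username="neo4j", password="password")
# include_source=True also creates a Document node linked via MENTIONS edges.
graph.add_graph_documents([graph_doc], include_source=True)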

@ -15,6 +15,7 @@ class Banana(LLM):
To use, you should have the ``banana-dev`` python package installed,
and the environment variable ``BANANA_API_KEY`` set with your API key.
This is the team API key available in the Banana dashboard.
Any parameters that are valid to be passed to the call can be passed
in, even if not explicitly saved on this class.
@ -23,10 +24,13 @@ class Banana(LLM):
.. code-block:: python
from langchain.llms import Banana
banana = Banana(model_key="")
banana = Banana(model_key="", model_url_slug="")
"""
model_key: str = ""
"""model key to use"""
model_url_slug: str = ""
"""model endpoint to use"""
model_kwargs: Dict[str, Any] = Field(default_factory=dict)
@ -72,6 +76,7 @@ class Banana(LLM):
"""Get the identifying parameters."""
return {
**{"model_key": self.model_key},
**{"model_url_slug": self.model_url_slug},
**{"model_kwargs": self.model_kwargs},
}
@ -89,7 +94,7 @@ class Banana(LLM):
) -> str:
"""Call to Banana endpoint."""
try:
import banana_dev as banana
from banana_dev import Client
except ImportError:
raise ImportError(
"Could not import banana-dev python package. "
@ -99,19 +104,25 @@ class Banana(LLM):
params = {**params, **kwargs}
api_key = self.banana_api_key
model_key = self.model_key
model_url_slug = self.model_url_slug
model_inputs = {
# a JSON payload specific to your model.
"prompt": prompt,
**params,
}
response = banana.run(api_key, model_key, model_inputs)
model = Client(
# Found in main dashboard
api_key=api_key,
# Both found in model details page
model_key=model_key,
url=f"https://{model_url_slug}.run.banana.dev",
)
response, meta = model.call("/", model_inputs)
try:
text = response["modelOutputs"][0]["output"]
text = response["outputs"]
except (KeyError, TypeError):
returned = response["modelOutputs"][0]
raise ValueError(
"Response should be of schema: {'output': 'text'}."
f"\nResponse was: {returned}"
"Response should be of schema: {'outputs': 'text'}."
"\nTo fix this:"
"\n- fork the source repo of the Banana model"
"\n- modify app.py to return the above schema"

@ -65,7 +65,7 @@ class HuggingFaceTextGenInference(LLM):
typical_p: Optional[float] = 0.95
"""Typical Decoding mass. See [Typical Decoding for Natural Language
Generation](https://arxiv.org/abs/2202.00666) for more information."""
temperature: float = 0.8
temperature: Optional[float] = 0.8
"""The value used to module the logits distribution."""
repetition_penalty: Optional[float] = None
"""The parameter for repetition penalty. 1.0 means no penalty.

@ -91,7 +91,7 @@ class PipelineAI(LLM, BaseModel):
try:
from pipeline import PipelineCloud
except ImportError:
raise ValueError(
raise ImportError(
"Could not import pipeline-ai python package. "
"Please install it with `pip install pipeline-ai`."
)

@ -121,7 +121,7 @@ class RWKV(LLM, BaseModel):
values["pipeline"] = PIPELINE(values["client"], values["tokens_path"])
except ImportError:
raise ValueError(
raise ImportError(
"Could not import rwkv python package. "
"Please install it with `pip install rwkv`."
)

@ -62,6 +62,10 @@ class VLLM(BaseLLM):
dtype: str = "auto"
"""The data type for the model weights and activations."""
download_dir: Optional[str] = None
"""Directory to download and load the weights. (Default to the default
cache dir of huggingface)"""
vllm_kwargs: Dict[str, Any] = Field(default_factory=dict)
"""Holds any model parameters valid for `vllm.LLM` call not explicitly specified."""
@ -84,6 +88,7 @@ class VLLM(BaseLLM):
tensor_parallel_size=values["tensor_parallel_size"],
trust_remote_code=values["trust_remote_code"],
dtype=values["dtype"],
download_dir=values["download_dir"],
**values["vllm_kwargs"],
)
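A sketch of the new download_dir option (the model name and path are illustrative; without the option, weights land in the default Hugging Face cache):

from langchain.llms import VLLM

llm = VLLM(model="mosaicml/mpt-7b", download_dir="/data/vllm-weights")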

@ -20,6 +20,7 @@ from langchain.output_parsers.fix import OutputFixingParser
from langchain.output_parsers.list import (
CommaSeparatedListOutputParser,
ListOutputParser,
NumberedListOutputParser,
)
from langchain.output_parsers.pydantic import PydanticOutputParser
from langchain.output_parsers.rail_parser import GuardrailsOutputParser
@ -36,6 +37,7 @@ __all__ = [
"EnumOutputParser",
"GuardrailsOutputParser",
"ListOutputParser",
"NumberedListOutputParser",
"OutputFixingParser",
"PydanticOutputParser",
"RegexDictParser",

@ -39,6 +39,8 @@ from langchain.load.serializable import Serializable
from langchain.pydantic_v1 import Field
from langchain.schema.runnable.config import (
RunnableConfig,
acall_func_with_variable_args,
call_func_with_variable_args,
ensure_config,
get_async_callback_manager_for_config,
get_callback_manager_for_config,
@ -47,16 +49,15 @@ from langchain.schema.runnable.config import (
patch_config,
)
from langchain.schema.runnable.utils import (
Input,
Output,
accepts_config,
accepts_run_manager,
accepts_run_manager_and_config,
gather_with_concurrency,
)
from langchain.utils.aiter import atee, py_anext
from langchain.utils.iter import safetee
Input = TypeVar("Input")
# Output type should implement __concat__, as e.g. str, list, dict do
Output = TypeVar("Output")
Other = TypeVar("Other")
@ -311,16 +312,7 @@ class Runnable(Generic[Input, Output], ABC):
name=config.get("run_name"),
)
try:
if accepts_run_manager_and_config(func):
output = func(
input,
run_manager=run_manager,
config=config,
) # type: ignore[call-arg]
elif accepts_run_manager(func):
output = func(input, run_manager=run_manager) # type: ignore[call-arg]
else:
output = func(input) # type: ignore[call-arg]
output = call_func_with_variable_args(func, input, run_manager, config)
except Exception as e:
run_manager.on_chain_error(e)
raise
@ -353,19 +345,9 @@ class Runnable(Generic[Input, Output], ABC):
name=config.get("run_name"),
)
try:
if accepts_run_manager_and_config(func):
output = await func(
input,
run_manager=run_manager,
config=config,
) # type: ignore[call-arg]
elif accepts_run_manager(func):
output = await func(
input,
run_manager=run_manager,
) # type: ignore[call-arg]
else:
output = await func(input) # type: ignore[call-arg]
output = await acall_func_with_variable_args(
func, input, run_manager, config
)
except Exception as e:
await run_manager.on_chain_error(e)
raise
@ -408,16 +390,15 @@ class Runnable(Generic[Input, Output], ABC):
)
]
try:
if accepts_run_manager_and_config(func):
output = func(
input,
run_manager=run_managers,
config=configs,
) # type: ignore[call-arg]
elif accepts_run_manager(func):
output = func(input, run_manager=run_managers) # type: ignore[call-arg]
else:
output = func(input) # type: ignore[call-arg]
kwargs: Dict[str, Any] = {}
if accepts_config(func):
kwargs["config"] = [
patch_config(c, callbacks=rm.get_child())
for c, rm in zip(configs, run_managers)
]
if accepts_run_manager(func):
kwargs["run_manager"] = run_managers
output = func(input, **kwargs) # type: ignore[call-arg]
except Exception as e:
for run_manager in run_managers:
run_manager.on_chain_error(e)
@ -479,16 +460,15 @@ class Runnable(Generic[Input, Output], ABC):
)
)
try:
if accepts_run_manager_and_config(func):
output = await func(
input,
run_manager=run_managers,
config=configs,
) # type: ignore[call-arg]
elif accepts_run_manager(func):
output = await func(input, run_manager=run_managers) # type: ignore
else:
output = await func(input) # type: ignore[call-arg]
kwargs: Dict[str, Any] = {}
if accepts_config(func):
kwargs["config"] = [
patch_config(c, callbacks=rm.get_child())
for c, rm in zip(configs, run_managers)
]
if accepts_run_manager(func):
kwargs["run_manager"] = run_managers
output = await func(input, **kwargs) # type: ignore[call-arg]
except Exception as e:
await asyncio.gather(
*(run_manager.on_chain_error(e) for run_manager in run_managers)
@ -550,19 +530,16 @@ class Runnable(Generic[Input, Output], ABC):
name=config.get("run_name"),
)
try:
if accepts_run_manager_and_config(transformer):
iterator = transformer(
input_for_transform,
run_manager=run_manager,
config=config,
) # type: ignore[call-arg]
elif accepts_run_manager(transformer):
iterator = transformer(
input_for_transform,
run_manager=run_manager,
) # type: ignore[call-arg]
else:
iterator = transformer(input_for_transform) # type: ignore[call-arg]
kwargs: Dict[str, Any] = {}
if accepts_config(transformer):
kwargs["config"] = patch_config(
config, callbacks=run_manager.get_child()
)
if accepts_run_manager(transformer):
kwargs["run_manager"] = run_manager
iterator = transformer(
input_for_transform, **kwargs
) # type: ignore[call-arg]
for chunk in iterator:
yield chunk
if final_output_supported:
@ -631,21 +608,16 @@ class Runnable(Generic[Input, Output], ABC):
name=config.get("run_name"),
)
try:
# mypy can't quite work out the type guard here, but this is safe,
# check implementations of the accepts_* functions
if accepts_run_manager_and_config(transformer):
iterator = transformer(
input_for_transform,
run_manager=run_manager,
config=config,
) # type: ignore[call-arg]
elif accepts_run_manager(transformer):
iterator = transformer(
input_for_transform,
run_manager=run_manager,
) # type: ignore[call-arg]
else:
iterator = transformer(input_for_transform) # type: ignore[call-arg]
kwargs: Dict[str, Any] = {}
if accepts_config(transformer):
kwargs["config"] = patch_config(
config, callbacks=run_manager.get_child()
)
if accepts_run_manager(transformer):
kwargs["run_manager"] = run_manager
iterator = transformer(
input_for_transform, **kwargs
) # type: ignore[call-arg]
async for chunk in iterator:
yield chunk
if final_output_supported:
@ -1756,7 +1728,7 @@ class RunnableLambda(Runnable[Input, Output]):
run_manager: CallbackManagerForChainRun,
config: RunnableConfig,
) -> Output:
output = self.func(input)
output = call_func_with_variable_args(self.func, input, run_manager, config)
# If the output is a runnable, invoke it
if isinstance(output, Runnable):
recursion_limit = config["recursion_limit"]
@ -1780,7 +1752,9 @@ class RunnableLambda(Runnable[Input, Output]):
run_manager: AsyncCallbackManagerForChainRun,
config: RunnableConfig,
) -> Output:
output = await self.afunc(input)
output = await acall_func_with_variable_args(
self.afunc, input, run_manager, config
)
# If the output is a runnable, invoke it
if isinstance(output, Runnable):
recursion_limit = config["recursion_limit"]
@ -1798,6 +1772,21 @@ class RunnableLambda(Runnable[Input, Output]):
)
return output
def _config(
self, config: Optional[RunnableConfig], callable: Callable[..., Any]
) -> RunnableConfig:
config = config or {}
if config.get("run_name") is None:
try:
run_name = callable.__name__
except AttributeError:
run_name = None
if run_name is not None:
return patch_config(config, run_name=run_name)
return config
def invoke(
self,
input: Input,
@ -1805,7 +1794,11 @@ class RunnableLambda(Runnable[Input, Output]):
**kwargs: Optional[Any],
) -> Output:
if hasattr(self, "func"):
return self._call_with_config(self._invoke, input, config)
return self._call_with_config(
self._invoke,
input,
self._config(config, self.func),
)
else:
raise TypeError(
"Cannot invoke a coroutine function synchronously."
@ -1819,7 +1812,11 @@ class RunnableLambda(Runnable[Input, Output]):
**kwargs: Optional[Any],
) -> Output:
if hasattr(self, "afunc"):
return await self._acall_with_config(self._ainvoke, input, config)
return await self._acall_with_config(
self._ainvoke,
input,
self._config(config, self.afunc),
)
else:
# Delegating to super implementation of ainvoke.
# Uses asyncio executor to run the sync version (invoke)

@ -3,13 +3,35 @@ from __future__ import annotations
from concurrent.futures import Executor, ThreadPoolExecutor
from contextlib import contextmanager
from copy import deepcopy
from typing import TYPE_CHECKING, Any, Dict, Generator, List, Optional, Union
from typing import (
TYPE_CHECKING,
Any,
Awaitable,
Callable,
Dict,
Generator,
List,
Optional,
Union,
)
from typing_extensions import TypedDict
from langchain.schema.runnable.utils import (
Input,
Output,
accepts_config,
accepts_run_manager,
)
if TYPE_CHECKING:
from langchain.callbacks.base import BaseCallbackManager, Callbacks
from langchain.callbacks.manager import AsyncCallbackManager, CallbackManager
from langchain.callbacks.manager import (
AsyncCallbackManager,
AsyncCallbackManagerForChainRun,
CallbackManager,
CallbackManagerForChainRun,
)
class RunnableConfig(TypedDict, total=False):
@ -117,6 +139,47 @@ def patch_config(
return config
def call_func_with_variable_args(
func: Union[
Callable[[Input], Output],
Callable[[Input, CallbackManagerForChainRun], Output],
Callable[[Input, CallbackManagerForChainRun, RunnableConfig], Output],
],
input: Input,
run_manager: CallbackManagerForChainRun,
config: RunnableConfig,
) -> Output:
"""Call function that may optionally accept a run_manager and/or config."""
kwargs: Dict[str, Any] = {}
if accepts_config(func):
kwargs["config"] = patch_config(config, callbacks=run_manager.get_child())
if accepts_run_manager(func):
kwargs["run_manager"] = run_manager
return func(input, **kwargs) # type: ignore[call-arg]
async def acall_func_with_variable_args(
func: Union[
Callable[[Input], Awaitable[Output]],
Callable[[Input, AsyncCallbackManagerForChainRun], Awaitable[Output]],
Callable[
[Input, AsyncCallbackManagerForChainRun, RunnableConfig],
Awaitable[Output],
],
],
input: Input,
run_manager: AsyncCallbackManagerForChainRun,
config: RunnableConfig,
) -> Output:
"""Call function that may optionally accept a run_manager and/or config."""
kwargs: Dict[str, Any] = {}
if accepts_config(func):
kwargs["config"] = patch_config(config, callbacks=run_manager.get_child())
if accepts_run_manager(func):
kwargs["run_manager"] = run_manager
return await func(input, **kwargs) # type: ignore[call-arg]
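A sketch of what these helpers buy at the RunnableLambda level: functions with different signatures can all be wrapped, because only the arguments a function declares are passed to it (the functions here are made up):

from langchain.schema.runnable import RunnableLambda

def plain(x):
    return x + 1

def with_config(x, config):  # also receives the RunnableConfig
    return x + 1

print(RunnableLambda(plain).invoke(1))        # 2
print(RunnableLambda(with_config).invoke(1))  # 2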
def get_callback_manager_for_config(config: RunnableConfig) -> CallbackManager:
from langchain.callbacks.manager import CallbackManager

@ -2,7 +2,11 @@ from __future__ import annotations
import asyncio
from inspect import signature
from typing import Any, Callable, Coroutine, Union
from typing import Any, Callable, Coroutine, TypeVar, Union
Input = TypeVar("Input")
# Output type should implement __concat__, as e.g. str, list, dict do
Output = TypeVar("Output")
async def gated_coro(semaphore: asyncio.Semaphore, coro: Coroutine) -> Any:
@ -26,8 +30,8 @@ def accepts_run_manager(callable: Callable[..., Any]) -> bool:
return False
def accepts_run_manager_and_config(callable: Callable[..., Any]) -> bool:
return (
accepts_run_manager(callable)
and signature(callable).parameters.get("config") is not None
)
def accepts_config(callable: Callable[..., Any]) -> bool:
try:
return signature(callable).parameters.get("config") is not None
except ValueError:
return False
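A tiny sketch of the underlying mechanism: accepts_config simply checks the signature for a parameter literally named config (f and g are made up):

from inspect import signature

def f(input, config): ...
def g(input): ...

print(signature(f).parameters.get("config") is not None)  # True
print(signature(g).parameters.get("config") is not None)  # False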

@ -100,6 +100,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
length_function: Callable[[str], int] = len,
keep_separator: bool = False,
add_start_index: bool = False,
strip_whitespace: bool = True,
) -> None:
"""Create a new TextSplitter.
@ -109,6 +110,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):
length_function: Function that measures the length of given chunks
keep_separator: Whether to keep the separator in the chunks
add_start_index: If `True`, includes chunk's start index in metadata
strip_whitespace: If `True`, strips whitespace from the start and end of
every document
"""
if chunk_overlap > chunk_size:
raise ValueError(
@ -120,6 +123,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
self._length_function = length_function
self._keep_separator = keep_separator
self._add_start_index = add_start_index
self._strip_whitespace = strip_whitespace
@abstractmethod
def split_text(self, text: str) -> List[str]:
@ -152,7 +156,8 @@ class TextSplitter(BaseDocumentTransformer, ABC):
def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
text = separator.join(docs)
text = text.strip()
if self._strip_whitespace:
text = text.strip()
if text == "":
return None
else:
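A sketch of the new flag (the text is illustrative): with strip_whitespace=False the surrounding whitespace of each joined chunk is preserved rather than stripped:

from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(separator="\n\n", strip_whitespace=False)
chunks = splitter.split_text("  first paragraph  \n\n  second paragraph  ")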

@ -108,7 +108,7 @@ def get_client(redis_url: str, **kwargs: Any) -> RedisType:
try:
import redis
except ImportError:
raise ValueError(
raise ImportError(
"Could not import redis python package. "
"Please install it with `pip install redis>=4.1.0`."
)

@ -9,6 +9,7 @@ from sqlalchemy import MetaData, Table, create_engine, inspect, select, text
from sqlalchemy.engine import Engine
from sqlalchemy.exc import ProgrammingError, SQLAlchemyError
from sqlalchemy.schema import CreateTable
from sqlalchemy.types import NullType
from langchain.utils import get_from_env
@ -314,6 +315,11 @@ class SQLDatabase:
tables.append(self._custom_table_info[table.name])
continue
# Ignore columns with unresolved (NullType) datatypes, e.g. JSON columns
for k, v in table.columns.items():
if type(v.type) is NullType:
table._columns.remove(v)
# add create table command
create_table = str(CreateTable(table).compile(self._engine))
table_info = f"{create_table.rstrip()}"
@ -384,6 +390,8 @@ class SQLDatabase:
connection.exec_driver_sql(f"SET @@dataset_id='{self._schema}'")
elif self.dialect == "mssql":
pass
elif self.dialect == "trino":
connection.exec_driver_sql(f"USE {self._schema}")
else: # postgresql and compatible dialects
connection.exec_driver_sql(f"SET search_path TO {self._schema}")
cursor = connection.execute(text(command))
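A hedged sketch of the new Trino path (the connection string is a placeholder; the trino SQLAlchemy dialect is assumed to be installed):

from langchain.utilities import SQLDatabase

# The configured schema is now applied with `USE my_schema` on Trino,
# instead of the postgres-style `SET search_path`.
db = SQLDatabase.from_uri("trino://user@localhost:8080/catalog", schema="my_schema")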

@ -147,7 +147,12 @@ class MyScale(VectorStore):
)
for k in ["id", "vector", "text", "metadata"]:
assert k in self.config.column_map
assert self.config.metric in ["ip", "cosine", "l2"]
assert self.config.metric.upper() in ["IP", "COSINE", "L2"]
if self.config.metric in ["ip", "cosine", "l2"]:
logger.warning(
"Lower case metric types will be deprecated "
"the future. Please use one of ('IP', 'Cosine', 'L2')"
)
# initialize the schema
dim = len(embedding.embed_query("try this out"))
@ -174,7 +179,9 @@ class MyScale(VectorStore):
self.BS = "\\"
self.must_escape = ("\\", "'")
self._embeddings = embedding
self.dist_order = "ASC" if self.config.metric in ["cosine", "l2"] else "DESC"
self.dist_order = (
"ASC" if self.config.metric.upper() in ["COSINE", "L2"] else "DESC"
)
# Create a connection to myscale
self.client = get_client(
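A sketch of the preferred spelling going forward (host and port are placeholders, and the MyScaleSettings field names are assumed from the surrounding config usage):

from langchain.vectorstores.myscale import MyScaleSettings

# Capitalized metric names avoid the deprecation warning above.
config = MyScaleSettings(host="localhost", port=8443, metric="Cosine")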

@ -0,0 +1,159 @@
import os
from typing import Any, Dict, Iterable, List, Optional, Type
from langchain.embeddings.base import Embeddings
from langchain.schema.document import Document
from langchain.vectorstores.base import VST, VectorStore
FIELD_TYPES = {
"f": "files",
"t": "texts",
"l": "links",
}
class NucliaDB(VectorStore):
"""NucliaDB vector store."""
_config: Dict[str, Any] = {}
def __init__(
self,
knowledge_box: str,
local: bool,
api_key: Optional[str] = None,
backend: Optional[str] = None,
) -> None:
"""Initialize the NucliaDB client.
Args:
knowledge_box: the Knowledge Box id.
local: Whether to use a local NucliaDB instance or Nuclia Cloud
api_key: A contributor API key for the kb (needed when local is False)
backend: The backend url to use when local is True, defaults to
http://localhost:8080
"""
try:
from nuclia.sdk import NucliaAuth
except ImportError:
raise ImportError(
"nuclia python package not found. "
"Please install it with `pip install nuclia`."
)
self._config["LOCAL"] = local
zone = os.environ.get("NUCLIA_ZONE", "europe-1")
self._kb = knowledge_box
if local:
if not backend:
backend = "http://localhost:8080"
self._config["BACKEND"] = f"{backend}/api/v1"
self._config["TOKEN"] = None
NucliaAuth().nucliadb(url=backend)
NucliaAuth().kb(url=self.kb_url, interactive=False)
else:
self._config["BACKEND"] = f"https://{zone}.nuclia.cloud/api/v1"
self._config["TOKEN"] = api_key
NucliaAuth().kb(
url=self.kb_url, token=self._config["TOKEN"], interactive=False
)
@property
def is_local(self) -> bool:
return self._config["LOCAL"]
@property
def kb_url(self) -> str:
return f"{self._config['BACKEND']}/kb/{self._kb}"
def add_texts(
self,
texts: Iterable[str],
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> List[str]:
"""Upload texts to NucliaDB"""
ids = []
from nuclia.sdk import NucliaResource
factory = NucliaResource()
for i, text in enumerate(texts):
extra: Dict[str, Any] = {"metadata": ""}
if metadatas:
extra = {"metadata": metadatas[i]}
id = factory.create(
texts={"text": {"body": text}},
extra=extra,
url=self.kb_url,
api_key=self._config["TOKEN"],
)
ids.append(id)
return ids
def delete(self, ids: Optional[List[str]] = None, **kwargs: Any) -> Optional[bool]:
if not ids:
return None
from nuclia.sdk import NucliaResource
factory = NucliaResource()
results: List[bool] = []
for id in ids:
try:
factory.delete(rid=id, url=self.kb_url, api_key=self._config["TOKEN"])
results.append(True)
except ValueError:
results.append(False)
return all(results)
def similarity_search(
self, query: str, k: int = 4, **kwargs: Any
) -> List[Document]:
from nuclia.sdk import NucliaSearch
from nucliadb_models.search import FindRequest, ResourceProperties
request = FindRequest(
query=query,
page_size=k,
show=[ResourceProperties.VALUES, ResourceProperties.EXTRA],
)
search = NucliaSearch()
results = search.find(
query=request, url=self.kb_url, api_key=self._config["TOKEN"]
)
paragraphs = []
for resource in results.resources.values():
for field in resource.fields.values():
for paragraph_id, paragraph in field.paragraphs.items():
info = paragraph_id.split("/")
field_type = FIELD_TYPES.get(info[1], None)
field_id = info[2]
if not field_type:
continue
value = getattr(resource.data, field_type, {}).get(field_id, None)
paragraphs.append(
{
"text": paragraph.text,
"metadata": {
"extra": getattr(
getattr(resource, "extra", {}), "metadata", None
),
"value": value,
},
"order": paragraph.order,
}
)
sorted_paragraphs = sorted(paragraphs, key=lambda x: x["order"])
return [
Document(page_content=paragraph["text"], metadata=paragraph["metadata"])
for paragraph in sorted_paragraphs
]
@classmethod
def from_texts(
cls: Type[VST],
texts: List[str],
embedding: Embeddings,
metadatas: Optional[List[dict]] = None,
**kwargs: Any,
) -> VST:
"""Return VectorStore initialized from texts and embeddings."""
raise NotImplementedError
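A usage sketch mirroring the docstring above (the ids and keys are placeholders):

from langchain.vectorstores.nucliadb import NucliaDB

ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
ids = ndb.add_texts(["This is a new test"], metadatas=[{"source": "example"}])
results = ndb.similarity_search("test", k=4)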

@ -349,16 +349,16 @@ class PGVector(VectorStore):
@property
def distance_strategy(self) -> Any:
if self._distance_strategy == "l2":
if self._distance_strategy == DistanceStrategy.EUCLIDEAN:
return self.EmbeddingStore.embedding.l2_distance
elif self._distance_strategy == "cosine":
elif self._distance_strategy == DistanceStrategy.COSINE:
return self.EmbeddingStore.embedding.cosine_distance
elif self._distance_strategy == "inner":
elif self._distance_strategy == DistanceStrategy.MAX_INNER_PRODUCT:
return self.EmbeddingStore.embedding.max_inner_product
else:
raise ValueError(
f"Got unexpected value for distance: {self._distance_strategy}. "
f"Should be one of `l2`, `cosine`, `inner`."
f"Should be one of {', '.join([ds.value for ds in DistanceStrategy])}."
)
def similarity_search_with_score_by_vector(
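A sketch of passing the enum explicitly (the connection string is a placeholder; FakeEmbeddings stands in for a real embedding model):

from langchain.embeddings.fake import FakeEmbeddings
from langchain.vectorstores.pgvector import DistanceStrategy, PGVector

store = PGVector(
    connection_string="postgresql+psycopg2://user:pass@localhost/db",
    embedding_function=FakeEmbeddings(size=1536),
    collection_name="demo",
    distance_strategy=DistanceStrategy.COSINE,
)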

File diff suppressed because it is too large

@ -1,6 +1,6 @@
[tool.poetry]
name = "langchain"
version = "0.0.280"
version = "0.0.284"
description = "Building applications with LLMs through composability"
authors = []
license = "MIT"

@ -8,3 +8,4 @@ _EXAMPLES_DIR = _THIS_DIR / "integration_tests" / "examples"
# Paths to test PDF files
HELLO_PDF = _EXAMPLES_DIR / "hello.pdf"
LAYOUT_PARSER_PAPER_PDF = _EXAMPLES_DIR / "layout-parser-paper.pdf"
DUPLICATE_CHARS = _EXAMPLES_DIR / "duplicate-chars.pdf"

@ -19,6 +19,10 @@ LAYOUT_PARSER_PAPER_PDF = (
Path(__file__).parent.parent.parent / "examples" / "layout-parser-paper.pdf"
)
DUPLICATE_CHARS = (
Path(__file__).parent.parent.parent / "examples" / "duplicate-chars.pdf"
)
def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) -> None:
"""Standard tests to verify that the given parser works.
@ -59,6 +63,26 @@ def _assert_with_parser(parser: BaseBlobParser, splits_by_page: bool = True) ->
assert metadata["page"] == 0
def _assert_with_duplicate_parser(parser: BaseBlobParser, dedupe: bool = False) -> None:
"""PDFPlumber tests to verify that duplicate characters appear or not
Args:
parser (BaseBlobParser): The parser to test.
splits_by_page (bool): Whether the parser splits by page or not by default.
dedupe: Avoiding the error of duplicate characters if `dedupe=True`.
"""
blob = Blob.from_path(DUPLICATE_CHARS)
doc_generator = parser.lazy_parse(blob)
assert isinstance(doc_generator, Iterator)
docs = list(doc_generator)
if dedupe:
# use dedupe to avoid duplicate characters.
assert "1000 Series" == docs[0].page_content.split("\n")[0]
else:
# duplicate characters will appear in the doc if dedupe is off
assert "11000000 SSeerriieess" == docs[0].page_content.split("\n")[0]
def test_pymupdf_loader() -> None:
"""Test PyMuPDF loader."""
_assert_with_parser(PyMuPDFParser())
@ -84,3 +108,5 @@ def test_pypdfium2_parser() -> None:
def test_pdfplumber_parser() -> None:
"""Test PDFPlumber parser."""
_assert_with_parser(PDFPlumberParser())
_assert_with_duplicate_parser(PDFPlumberParser())
_assert_with_duplicate_parser(PDFPlumberParser(dedupe=True), dedupe=True)
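A sketch of the parser-level flag exercised by these tests (the file path is a placeholder):

from langchain.document_loaders.blob_loaders import Blob
from langchain.document_loaders.parsers.pdf import PDFPlumberParser

# With dedupe=True, doubled glyphs like "11000000 SSeerriieess"
# collapse back to "1000 Series".
docs = list(PDFPlumberParser(dedupe=True).lazy_parse(Blob.from_path("duplicate-chars.pdf")))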

@ -7,7 +7,9 @@ from langchain.document_loaders import PlaywrightURLLoader
from langchain.document_loaders.url_playwright import PlaywrightEvaluator
if TYPE_CHECKING:
from playwright.async_api import AsyncBrowser, AsyncPage, AsyncResponse
from playwright.async_api import Browser as AsyncBrowser
from playwright.async_api import Page as AsyncPage
from playwright.async_api import Response as AsyncResponse
from playwright.sync_api import Browser, Page, Response

@ -0,0 +1,98 @@
from typing import Any
from unittest import mock
from langchain.vectorstores.nucliadb import NucliaDB
class attrdict(dict):
def __getitem__(self, key: str) -> Any:
value = dict.__getitem__(self, key)
return attrdict(value) if isinstance(value, dict) else value
__getattr__ = __getitem__
def FakeCreate(**args: Any) -> Any:
def fn(self: Any, **kwargs: Any) -> str:
return "fake_uuid"
return fn
def FakeDelete(**args: Any) -> Any:
def fn(self: Any, **kwargs: Any) -> None:
return None
return fn
def FakeFind(**args: Any) -> Any:
def fn(self: Any, **kwargs: Any) -> Any:
return attrdict(
{
"resources": {
"123": attrdict(
{
"fields": {
"456": attrdict(
{
"paragraphs": {
"123/t/text/0-14": attrdict(
{
"text": "This is a test",
"order": 0,
}
),
}
}
)
},
"data": {
"texts": {
"text": {
"body": "This is a test",
}
}
},
"extra": attrdict({"metadata": {"some": "metadata"}}),
}
)
}
}
)
return fn
def test_add_texts() -> None:
with mock.patch(
"nuclia.sdk.resource.NucliaResource.create",
new_callable=FakeCreate,
):
ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
assert ndb.is_local is False
ids = ndb.add_texts(["This is a new test", "This is a second test"])
assert len(ids) == 2
def test_delete() -> None:
with mock.patch(
"nuclia.sdk.resource.NucliaResource.delete",
new_callable=FakeDelete,
):
ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
success = ndb.delete(["123", "456"])
assert success
def test_search() -> None:
with mock.patch(
"nuclia.sdk.search.NucliaSearch.find",
new_callable=FakeFind,
):
ndb = NucliaDB(knowledge_box="YOUR_KB_ID", local=False, api_key="YOUR_API_KEY")
results = ndb.similarity_search("Who was inspired by Ada Lovelace?")
assert len(results) == 1
assert results[0].page_content == "This is a test"
assert results[0].metadata["extra"]["some"] == "metadata"
assert results[0].metadata["value"]["body"] == "This is a test"

File diff suppressed because one or more lines are too long

@ -948,7 +948,7 @@ async def test_higher_order_lambda_runnable(
parent_run = next(r for r in tracer.runs if r.parent_run_id is None)
assert len(parent_run.child_runs) == 2
router_run = parent_run.child_runs[1]
assert router_run.name == "RunnableLambda"
assert router_run.name == "router"
assert len(router_run.child_runs) == 1
math_run = router_run.child_runs[0]
assert math_run.name == "RunnableSequence"
@ -980,7 +980,7 @@ async def test_higher_order_lambda_runnable(
parent_run = next(r for r in tracer.runs if r.parent_run_id is None)
assert len(parent_run.child_runs) == 2
router_run = parent_run.child_runs[1]
assert router_run.name == "RunnableLambda"
assert router_run.name == "arouter"
assert len(router_run.child_runs) == 1
math_run = router_run.child_runs[0]
assert math_run.name == "RunnableSequence"
