From 89ef440c14634e74bae069925ef75d3ff4b12445 Mon Sep 17 00:00:00 2001
From: Palau <paul.vorobyev@gmail.com>
Date: Mon, 25 Sep 2023 16:10:13 -0400
Subject: [PATCH] Kay retriever (#10657)

- **Description**: Adding retrievers for [kay.ai](https://kay.ai) and
SEC filings powered by Kay and Cybersyn. Kay provides context as a
service: it's an API built for RAG.
- **Issue**: N/A
- **Dependencies**: Just added a dep to the
[kay](https://pypi.org/project/kay/) package
- **Tag maintainer**: @baskaryan @hwchase17 Discussed in slack
- **Twitter handle:** [@vishalrohra_](https://twitter.com/vishalrohra_)

---------

Co-authored-by: Bagatur <baskaryan@gmail.com>
---
 .../document_loaders/aws_s3_directory.ipynb   |  305 ++--
 .../document_loaders/aws_s3_file.ipynb        |  233 +--
 .../dynamodb_chat_message_history.ipynb       |  684 ++++-----
 docs/extras/integrations/retrievers/kay.ipynb |  207 +++
 .../integrations/retrievers/pubmed.ipynb      |    2 +-
 .../integrations/retrievers/sec_filings.ipynb |  165 +++
 docs/extras/use_cases/web_scraping.ipynb      | 1299 +++++++++--------
 .../langchain/retrievers/__init__.py          |    2 +
 libs/langchain/langchain/retrievers/kay.py    |   59 +
 .../integration_tests/retrievers/test_kay.py  |   24 +
 10 files changed, 1722 insertions(+), 1258 deletions(-)
 create mode 100644 docs/extras/integrations/retrievers/kay.ipynb
 create mode 100644 docs/extras/integrations/retrievers/sec_filings.ipynb
 create mode 100644 libs/langchain/langchain/retrievers/kay.py
 create mode 100644 libs/langchain/tests/integration_tests/retrievers/test_kay.py

diff --git a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb
index 4d474ca64a..e40a3702c6 100644
--- a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb
+++ b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb
@@ -1,156 +1,159 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "a634365e",
-   "metadata": {},
-   "source": [
-    "# AWS S3 Directory\n",
-    "\n",
-    ">[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service\n",
-    "\n",
-    ">[AWS S3 Directory](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html)\n",
-    "\n",
-    "This covers how to load document objects from an `AWS S3 Directory` object."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "49815096",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "#!pip install boto3"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "2f0cd6a5",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import S3DirectoryLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "321cc7f1",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "loader = S3DirectoryLoader(\"testing-hwc\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2b11d155",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "loader.load()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "0690c40a",
-   "metadata": {},
-   "source": [
-    "## Specifying a prefix\n",
-    "You can also specify a prefix for more finegrained control over what files to load."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "72d44781",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "loader = S3DirectoryLoader(\"testing-hwc\", prefix=\"fake\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "2d3c32db",
-   "metadata": {},
-   "outputs": [
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "a634365e",
+      "metadata": {},
+      "source": [
+        "# AWS S3 Directory\n",
+        "\n",
+        ">[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service\n",
+        "\n",
+        ">[AWS S3 Directory](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html)\n",
+        "\n",
+        "This covers how to load document objects from an `AWS S3 Directory` object."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "49815096",
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "#!pip install boto3"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "id": "2f0cd6a5",
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "from langchain.document_loaders import S3DirectoryLoader"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 3,
+      "id": "321cc7f1",
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "loader = S3DirectoryLoader(\"testing-hwc\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "2b11d155",
+      "metadata": {
+        "tags": []
+      },
+      "outputs": [],
+      "source": [
+        "loader.load()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "0690c40a",
+      "metadata": {},
+      "source": [
+        "## Specifying a prefix\n",
+        "You can also specify a prefix for more fine-grained control over what files to load."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "id": "72d44781",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "loader = S3DirectoryLoader(\"testing-hwc\", prefix=\"fake\")"
+      ]
+    },
     {
-     "data": {
-      "text/plain": [
-       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
+      "cell_type": "code",
+      "execution_count": 6,
+      "id": "2d3c32db",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
+            ]
+          },
+          "execution_count": 6,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "loader.load()"
       ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## Configuring the AWS Boto3 client\n",
+        "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n",
+        "named arguments when creating the S3DirectoryLoader.\n",
+        "This is useful for instance when AWS credentials can't be set as environment variables.\n",
+        "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured."
+      ],
+      "metadata": {},
+      "id": "91a7ac07"
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "loader = S3DirectoryLoader(\"testing-hwc\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
+      ],
+      "metadata": {},
+      "id": "f485ec8c"
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "loader.load()"
+      ],
+      "metadata": {},
+      "id": "c0fa76ae"
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.6"
     }
-   ],
-   "source": [
-    "loader.load()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "source": [
-    "## Configuring the AWS Boto3 client\n",
-    "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n",
-    "named arguments when creating the S3DirectoryLoader.\n",
-    "This is useful for instance when AWS credentials can't be set as environment variables.\n",
-    "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured."
-   ],
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
-   "source": [
-    "loader = S3DirectoryLoader(\"testing-hwc\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
-   ],
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
-   "source": [
-    "loader.load()"
-   ],
-   "metadata": {}
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb
index 91d7751eb6..a13fcf5cff 100644
--- a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb
+++ b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb
@@ -1,121 +1,122 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "66a7777e",
-   "metadata": {},
-   "source": [
-    "# AWS S3 File\n",
-    "\n",
-    ">[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service.\n",
-    "\n",
-    ">[AWS S3 Buckets](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html)\n",
-    "\n",
-    "This covers how to load document objects from an `AWS S3 File` object."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "id": "9ec8a3b3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import S3FileLoader"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "43128d8d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#!pip install boto3"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "35d6809a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "efd6be84",
-   "metadata": {},
-   "outputs": [
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "id": "66a7777e",
+      "metadata": {},
+      "source": [
+        "# AWS S3 File\n",
+        "\n",
+        ">[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service.\n",
+        "\n",
+        ">[AWS S3 Buckets](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html)\n",
+        "\n",
+        "This covers how to load document objects from an `AWS S3 File` object."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 1,
+      "id": "9ec8a3b3",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.document_loaders import S3FileLoader"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "id": "43128d8d",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "#!pip install boto3"
+      ]
+    },
     {
-     "data": {
-      "text/plain": [
-       "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
+      "cell_type": "code",
+      "execution_count": 8,
+      "id": "35d6809a",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\")"
       ]
-     },
-     "execution_count": 9,
-     "metadata": {},
-     "output_type": "execute_result"
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "id": "efd6be84",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]"
+            ]
+          },
+          "execution_count": 9,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "loader.load()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "93689594",
+      "metadata": {},
+      "source": [
+        "## Configuring the AWS Boto3 client\n",
+        "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n",
+        "named arguments when creating the S3FileLoader.\n",
+        "This is useful for instance when AWS credentials can't be set as environment variables.\n",
+        "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
+      ],
+      "metadata": {},
+      "id": "43106ee8"
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "outputs": [],
+      "source": [
+        "loader.load()"
+      ],
+      "metadata": {},
+      "id": "1764a727"
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.10.6"
     }
-   ],
-   "source": [
-    "loader.load()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "93689594",
-   "metadata": {},
-   "source": [
-    "## Configuring the AWS Boto3 client\n",
-    "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n",
-    "named arguments when creating the S3DirectoryLoader.\n",
-    "This is useful for instance when AWS credentials can't be set as environment variables.\n",
-    "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
-   "source": [
-    "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")"
-   ],
-   "metadata": {}
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "outputs": [],
-   "source": [
-    "loader.load()"
-   ],
-   "metadata": {}
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.10.6"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
-
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb b/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb
index 53e7230e2b..3f324ef5e8 100644
--- a/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb
+++ b/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb
@@ -1,350 +1,352 @@
 {
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "91c6a7ef",
-   "metadata": {},
-   "source": [
-    "# Dynamodb Chat Message History\n",
-    "\n",
-    "This notebook goes over how to use Dynamodb to store chat message history."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3f608be0",
-   "metadata": {},
-   "source": [
-    "First make sure you have correctly configured the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). Then make sure you have installed boto3."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "030d784f",
-   "metadata": {},
-   "source": [
-    "Next, create the DynamoDB Table where we will be storing messages:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "id": "93ce1811",
-   "metadata": {},
-   "outputs": [
+  "cells": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0\n"
-     ]
-    }
-   ],
-   "source": [
-    "import boto3\n",
-    "\n",
-    "# Get the service resource.\n",
-    "dynamodb = boto3.resource(\"dynamodb\")\n",
-    "\n",
-    "# Create the DynamoDB table.\n",
-    "table = dynamodb.create_table(\n",
-    "    TableName=\"SessionTable\",\n",
-    "    KeySchema=[{\"AttributeName\": \"SessionId\", \"KeyType\": \"HASH\"}],\n",
-    "    AttributeDefinitions=[{\"AttributeName\": \"SessionId\", \"AttributeType\": \"S\"}],\n",
-    "    BillingMode=\"PAY_PER_REQUEST\",\n",
-    ")\n",
-    "\n",
-    "# Wait until the table exists.\n",
-    "table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"SessionTable\")\n",
-    "\n",
-    "# Print out some data about the table.\n",
-    "print(table.item_count)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1a9b310b",
-   "metadata": {},
-   "source": [
-    "## DynamoDBChatMessageHistory"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "d15e3302",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n",
-    "\n",
-    "history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"0\")\n",
-    "\n",
-    "history.add_user_message(\"hi!\")\n",
-    "\n",
-    "history.add_ai_message(\"whats up?\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "64fc465e",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "markdown",
+      "id": "91c6a7ef",
+      "metadata": {},
+      "source": [
+        "# Dynamodb Chat Message History\n",
+        "\n",
+        "This notebook goes over how to use Dynamodb to store chat message history."
+      ]
+    },
     {
-     "data": {
-      "text/plain": "[HumanMessage(content='hi!', additional_kwargs={}, example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False),\n HumanMessage(content='hi!', additional_kwargs={}, example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False)]"
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "history.messages"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "955f1b15",
-   "metadata": {},
-   "source": [
-    "## DynamoDBChatMessageHistory with Custom Endpoint URL\n",
-    "\n",
-    "Sometimes it is useful to specify the URL to the AWS endpoint to connect to. For instance, when you are running locally against [Localstack](https://localstack.cloud/). For those cases you can specify the URL via the `endpoint_url` parameter in the constructor."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "id": "225713c8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n",
-    "\n",
-    "history = DynamoDBChatMessageHistory(\n",
-    "    table_name=\"SessionTable\",\n",
-    "    session_id=\"0\",\n",
-    "    endpoint_url=\"http://localhost.localstack.cloud:4566\",\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "source": [
-    "## DynamoDBChatMessageHistory With Different Keys Composite Keys\n",
-    "The default key for DynamoDBChatMessageHistory is ```{\"SessionId\": self.session_id}```, but you can modify this to match your table design.\n",
-    "\n",
-    "### Primary Key Name\n",
-    "You may modify the primary key by passing in a primary_key_name value in the constructor, resulting in the following:\n",
-    "```{self.primary_key_name: self.session_id}```\n",
-    "\n",
-    "### Composite Keys\n",
-    "When using an existing DynamoDB table, you may need to modify the key structure from the default of to something including a Sort Key. To do this you may use the ```key``` parameter.\n",
-    "\n",
-    "Passing a value for key will override the primary_key parameter, and the resulting key structure will be the passed value.\n"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "outputs": [
+      "cell_type": "markdown",
+      "id": "3f608be0",
+      "metadata": {},
+      "source": [
+        "First make sure you have correctly configured the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). Then make sure you have installed boto3."
+      ]
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "0\n"
-     ]
+      "cell_type": "markdown",
+      "id": "030d784f",
+      "metadata": {},
+      "source": [
+        "Next, create the DynamoDB Table where we will be storing messages:"
+      ]
     },
     {
-     "data": {
-      "text/plain": "[HumanMessage(content='hello, composite dynamodb table!', additional_kwargs={}, example=False)]"
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n",
-    "\n",
-    "composite_table = dynamodb.create_table(\n",
-    "    TableName=\"CompositeTable\",\n",
-    "    KeySchema=[{\"AttributeName\": \"PK\", \"KeyType\": \"HASH\"}, {\"AttributeName\": \"SK\", \"KeyType\": \"RANGE\"}],\n",
-    "    AttributeDefinitions=[{\"AttributeName\": \"PK\", \"AttributeType\": \"S\"}, {\"AttributeName\": \"SK\", \"AttributeType\": \"S\"}],\n",
-    "    BillingMode=\"PAY_PER_REQUEST\",\n",
-    ")\n",
-    "\n",
-    "# Wait until the table exists.\n",
-    "composite_table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"CompositeTable\")\n",
-    "\n",
-    "# Print out some data about the table.\n",
-    "print(composite_table.item_count)\n",
-    "\n",
-    "my_key = {\n",
-    "    \"PK\": \"session_id::0\",\n",
-    "    \"SK\":  \"langchain_history\",\n",
-    "}\n",
-    "\n",
-    "composite_key_history = DynamoDBChatMessageHistory(\n",
-    "    table_name=\"CompositeTable\",\n",
-    "    session_id=\"0\",\n",
-    "    endpoint_url=\"http://localhost.localstack.cloud:4566\",\n",
-    "    key=my_key,\n",
-    ")\n",
-    "\n",
-    "composite_key_history.add_user_message(\"hello, composite dynamodb table!\")\n",
-    "\n",
-    "composite_key_history.messages"
-   ],
-   "metadata": {
-    "collapsed": false
-   }
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "id": "3b33c988",
-   "metadata": {},
-   "source": [
-    "## Agent with DynamoDB Memory"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "id": "f92d9499",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.agents import Tool\n",
-    "from langchain.memory import ConversationBufferMemory\n",
-    "from langchain.chat_models import ChatOpenAI\n",
-    "from langchain.agents import initialize_agent\n",
-    "from langchain.agents import AgentType\n",
-    "from langchain.utilities import PythonREPL\n",
-    "from getpass import getpass\n",
-    "\n",
-    "message_history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"1\")\n",
-    "memory = ConversationBufferMemory(\n",
-    "    memory_key=\"chat_history\", chat_memory=message_history, return_messages=True\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "id": "1167eeba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "python_repl = PythonREPL()\n",
-    "\n",
-    "# You can create the tool to pass to an agent\n",
-    "tools = [\n",
-    "    Tool(\n",
-    "        name=\"python_repl\",\n",
-    "        description=\"A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.\",\n",
-    "        func=python_repl.run,\n",
-    "    )\n",
-    "]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "id": "fce085c5",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "code",
+      "execution_count": 10,
+      "id": "93ce1811",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "0\n"
+          ]
+        }
+      ],
+      "source": [
+        "import boto3\n",
+        "\n",
+        "# Get the service resource.\n",
+        "dynamodb = boto3.resource(\"dynamodb\")\n",
+        "\n",
+        "# Create the DynamoDB table.\n",
+        "table = dynamodb.create_table(\n",
+        "    TableName=\"SessionTable\",\n",
+        "    KeySchema=[{\"AttributeName\": \"SessionId\", \"KeyType\": \"HASH\"}],\n",
+        "    AttributeDefinitions=[{\"AttributeName\": \"SessionId\", \"AttributeType\": \"S\"}],\n",
+        "    BillingMode=\"PAY_PER_REQUEST\",\n",
+        ")\n",
+        "\n",
+        "# Wait until the table exists.\n",
+        "table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"SessionTable\")\n",
+        "\n",
+        "# Print out some data about the table.\n",
+        "print(table.item_count)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "1a9b310b",
+      "metadata": {},
+      "source": [
+        "## DynamoDBChatMessageHistory"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 11,
+      "id": "d15e3302",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n",
+        "\n",
+        "history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"0\")\n",
+        "\n",
+        "history.add_user_message(\"hi!\")\n",
+        "\n",
+        "history.add_ai_message(\"whats up?\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 12,
+      "id": "64fc465e",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": "[HumanMessage(content='hi!', additional_kwargs={}, example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False),\n HumanMessage(content='hi!', additional_kwargs={}, example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False)]"
+          },
+          "execution_count": 12,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "history.messages"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "955f1b15",
+      "metadata": {},
+      "source": [
+        "## DynamoDBChatMessageHistory with Custom Endpoint URL\n",
+        "\n",
+        "Sometimes it is useful to specify the URL to the AWS endpoint to connect to. For instance, when you are running locally against [Localstack](https://localstack.cloud/). For those cases you can specify the URL via the `endpoint_url` parameter in the constructor."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 13,
+      "id": "225713c8",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n",
+        "\n",
+        "history = DynamoDBChatMessageHistory(\n",
+        "    table_name=\"SessionTable\",\n",
+        "    session_id=\"0\",\n",
+        "    endpoint_url=\"http://localhost.localstack.cloud:4566\",\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "source": [
+        "## DynamoDBChatMessageHistory With Different Keys Composite Keys\n",
+        "The default key for DynamoDBChatMessageHistory is ```{\"SessionId\": self.session_id}```, but you can modify this to match your table design.\n",
+        "\n",
+        "### Primary Key Name\n",
+        "You may modify the primary key by passing in a primary_key_name value in the constructor, resulting in the following:\n",
+        "```{self.primary_key_name: self.session_id}```\n",
+        "\n",
+        "### Composite Keys\n",
+        "When using an existing DynamoDB table, you may need to modify the key structure from the default to one that includes a Sort Key. To do this you may use the ```key``` parameter.\n",
+        "\n",
+        "Passing a value for key will override the primary_key_name parameter, and the resulting key structure will be the passed value.\n"
+      ],
+      "metadata": {
+        "collapsed": false
+      },
+      "id": "c9bc0693"
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 14,
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "0\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": "[HumanMessage(content='hello, composite dynamodb table!', additional_kwargs={}, example=False)]"
+          },
+          "execution_count": 14,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n",
+        "\n",
+        "composite_table = dynamodb.create_table(\n",
+        "    TableName=\"CompositeTable\",\n",
+        "    KeySchema=[{\"AttributeName\": \"PK\", \"KeyType\": \"HASH\"}, {\"AttributeName\": \"SK\", \"KeyType\": \"RANGE\"}],\n",
+        "    AttributeDefinitions=[{\"AttributeName\": \"PK\", \"AttributeType\": \"S\"}, {\"AttributeName\": \"SK\", \"AttributeType\": \"S\"}],\n",
+        "    BillingMode=\"PAY_PER_REQUEST\",\n",
+        ")\n",
+        "\n",
+        "# Wait until the table exists.\n",
+        "composite_table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"CompositeTable\")\n",
+        "\n",
+        "# Print out some data about the table.\n",
+        "print(composite_table.item_count)\n",
+        "\n",
+        "my_key = {\n",
+        "    \"PK\": \"session_id::0\",\n",
+        "    \"SK\":  \"langchain_history\",\n",
+        "}\n",
+        "\n",
+        "composite_key_history = DynamoDBChatMessageHistory(\n",
+        "    table_name=\"CompositeTable\",\n",
+        "    session_id=\"0\",\n",
+        "    endpoint_url=\"http://localhost.localstack.cloud:4566\",\n",
+        "    key=my_key,\n",
+        ")\n",
+        "\n",
+        "composite_key_history.add_user_message(\"hello, composite dynamodb table!\")\n",
+        "\n",
+        "composite_key_history.messages"
+      ],
+      "metadata": {
+        "collapsed": false
+      },
+      "id": "a7fa0331"
+    },
     {
-     "ename": "ValidationError",
-     "evalue": "1 validation error for ChatOpenAI\n__root__\n  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)",
-     "output_type": "error",
-     "traceback": [
-      "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m",
-      "\u001B[0;31mValidationError\u001B[0m                           Traceback (most recent call last)",
-      "Cell \u001B[0;32mIn[17], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m llm \u001B[38;5;241m=\u001B[39m \u001B[43mChatOpenAI\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtemperature\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m0\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m      2\u001B[0m agent_chain \u001B[38;5;241m=\u001B[39m initialize_agent(\n\u001B[1;32m      3\u001B[0m     tools,\n\u001B[1;32m      4\u001B[0m     llm,\n\u001B[0;32m   (...)\u001B[0m\n\u001B[1;32m      7\u001B[0m     memory\u001B[38;5;241m=\u001B[39mmemory,\n\u001B[1;32m      8\u001B[0m )\n",
-      "File \u001B[0;32m~/Documents/projects/langchain/libs/langchain/langchain/load/serializable.py:74\u001B[0m, in \u001B[0;36mSerializable.__init__\u001B[0;34m(self, **kwargs)\u001B[0m\n\u001B[1;32m     73\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs: Any) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m---> 74\u001B[0m     \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__init__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m     75\u001B[0m     \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_lc_kwargs \u001B[38;5;241m=\u001B[39m kwargs\n",
-      "File \u001B[0;32m~/Documents/projects/langchain/.venv/lib/python3.9/site-packages/pydantic/main.py:341\u001B[0m, in \u001B[0;36mpydantic.main.BaseModel.__init__\u001B[0;34m()\u001B[0m\n",
-      "\u001B[0;31mValidationError\u001B[0m: 1 validation error for ChatOpenAI\n__root__\n  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)"
-     ]
+      "attachments": {},
+      "cell_type": "markdown",
+      "id": "3b33c988",
+      "metadata": {},
+      "source": [
+        "## Agent with DynamoDB Memory"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 15,
+      "id": "f92d9499",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.agents import Tool\n",
+        "from langchain.memory import ConversationBufferMemory\n",
+        "from langchain.chat_models import ChatOpenAI\n",
+        "from langchain.agents import initialize_agent\n",
+        "from langchain.agents import AgentType\n",
+        "from langchain.utilities import PythonREPL\n",
+        "from getpass import getpass\n",
+        "\n",
+        "message_history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"1\")\n",
+        "memory = ConversationBufferMemory(\n",
+        "    memory_key=\"chat_history\", chat_memory=message_history, return_messages=True\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 16,
+      "id": "1167eeba",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "python_repl = PythonREPL()\n",
+        "\n",
+        "# You can create the tool to pass to an agent\n",
+        "tools = [\n",
+        "    Tool(\n",
+        "        name=\"python_repl\",\n",
+        "        description=\"A Python shell. Use this to execute python commands. Input should be a valid python command. If you want to see the output of a value, you should print it out with `print(...)`.\",\n",
+        "        func=python_repl.run,\n",
+        "    )\n",
+        "]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 17,
+      "id": "fce085c5",
+      "metadata": {},
+      "outputs": [
+        {
+          "ename": "ValidationError",
+          "evalue": "1 validation error for ChatOpenAI\n__root__\n  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)",
+          "output_type": "error",
+          "traceback": [
+            "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+            "\u001b[0;31mValidationError\u001b[0m                           Traceback (most recent call last)",
+            "Cell \u001b[0;32mIn[17], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mChatOpenAI\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtemperature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m      2\u001b[0m agent_chain \u001b[38;5;241m=\u001b[39m initialize_agent(\n\u001b[1;32m      3\u001b[0m     tools,\n\u001b[1;32m      4\u001b[0m     llm,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m      7\u001b[0m     memory\u001b[38;5;241m=\u001b[39mmemory,\n\u001b[1;32m      8\u001b[0m )\n",
+            "File \u001b[0;32m~/Documents/projects/langchain/libs/langchain/langchain/load/serializable.py:74\u001b[0m, in \u001b[0;36mSerializable.__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m     73\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 74\u001b[0m     \u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     75\u001b[0m     \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lc_kwargs \u001b[38;5;241m=\u001b[39m kwargs\n",
+            "File \u001b[0;32m~/Documents/projects/langchain/.venv/lib/python3.9/site-packages/pydantic/main.py:341\u001b[0m, in \u001b[0;36mpydantic.main.BaseModel.__init__\u001b[0;34m()\u001b[0m\n",
+            "\u001b[0;31mValidationError\u001b[0m: 1 validation error for ChatOpenAI\n__root__\n  Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass  `openai_api_key` as a named parameter. (type=value_error)"
+          ]
+        }
+      ],
+      "source": [
+        "llm = ChatOpenAI(temperature=0)\n",
+        "agent_chain = initialize_agent(\n",
+        "    tools,\n",
+        "    llm,\n",
+        "    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,\n",
+        "    verbose=True,\n",
+        "    memory=memory,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "952a3103",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "agent_chain.run(input=\"Hello!\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "54c4aaf4",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "agent_chain.run(input=\"Who owns Twitter?\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "f9013118",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "agent_chain.run(input=\"My name is Bob.\")"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "405e5315",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "agent_chain.run(input=\"Who am I?\")\n"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.11.3"
     }
-   ],
-   "source": [
-    "llm = ChatOpenAI(temperature=0)\n",
-    "agent_chain = initialize_agent(\n",
-    "    tools,\n",
-    "    llm,\n",
-    "    agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,\n",
-    "    verbose=True,\n",
-    "    memory=memory,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "952a3103",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agent_chain.run(input=\"Hello!\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "54c4aaf4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agent_chain.run(input=\"Who owns Twitter?\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f9013118",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agent_chain.run(input=\"My name is Bob.\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "405e5315",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agent_chain.run(input=\"Who am I?\")\n"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/docs/extras/integrations/retrievers/kay.ipynb b/docs/extras/integrations/retrievers/kay.ipynb
new file mode 100644
index 0000000000..1175d11924
--- /dev/null
+++ b/docs/extras/integrations/retrievers/kay.ipynb
@@ -0,0 +1,207 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "263f914c-9d67-4316-8b3d-03c3b99ba9d8",
+   "metadata": {},
+   "source": [
+    "Kay.ai\n",
+    "=\n",
+    "\n",
+    "> Data API built for RAG 🕵️ We are curating the world's largest datasets as high-quality embeddings so your AI agents can retrieve context on the fly. Latest models, fast retrieval, and zero infra.\n",
+    "\n",
+    "This notebook shows you how to retrieve datasets supported by [Kay](https://kay.ai/). You can currently search SEC Filings and Press Releases of US companies. Visit [kay.ai](https://kay.ai) for the latest data drops. For any questions, join our [Discord](https://discord.gg/hAnE4e5T6M) or [tweet at us](https://twitter.com/vishalrohra_)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fc507b8e-ea51-417c-93da-42bf998a1195",
+   "metadata": {},
+   "source": [
+    "Installation\n",
+    "=\n",
+    "\n",
+    "First you will need to install the [`kay` package](https://pypi.org/project/kay/). You will also need an API key: you can get one for free at [https://kay.ai](https://kay.ai/). Once you have an API key, you must set it as an environment variable `KAY_API_KEY`.\n",
+    "\n",
+    "`KayAiRetriever` has a static `.create()` factory method that takes the following arguments:\n",
+    "\n",
+    "* `dataset_id: string` required -- A Kay dataset id. This is a collection of data about a particular entity such as companies, people, or places. For example, try `\"company\"` \n",
+    "* `data_types: List[string]` optional -- This is a category within a dataset based on its origin or format, such as ‘SEC Filings’, ‘Press Releases’, or ‘Reports’ within the “company” dataset. For example, try `[\"10-K\", \"10-Q\", \"PressRelease\"]` under the “company” dataset. If left empty, Kay will retrieve the most relevant context across all types.\n",
+    "* `num_contexts: int` optional, defaults to 6 -- The number of document chunks to retrieve on each call to `get_relevant_documents()`"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c923bea0-585a-4f62-8662-efc167e8d793",
+   "metadata": {},
+   "source": [
+    "Examples\n",
+    "=\n",
+    "\n",
+    "Basic Retriever Usage\n",
+    "-"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f7b8c99c-0341-4f3c-912f-a11e98f7de71",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      " ········\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Setup API key\n",
+    "from getpass import getpass\n",
+    "KAY_API_KEY = getpass()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "b4d4d386-2a6b-4942-863e-9202f5a9f1d6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.retrievers import KayAiRetriever\n",
+    "import os\n",
+    "from kay.rag.retrievers import KayRetriever\n",
+    "os.environ[\"KAY_API_KEY\"] = KAY_API_KEY\n",
+    "retriever = KayAiRetriever.create(dataset_id=\"company\", data_types=[\"10-K\", \"10-Q\", \"PressRelease\"], num_contexts=3)\n",
+    "docs = retriever.get_relevant_documents(\"What were the biggest strategy changes and partnerships made by Roku in 2023??\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "04ee2d6b-c2ab-4e15-8a8b-afaf6ef8c0f6",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[Document(page_content='Company Name: ROKU INC\\nCompany Industry: CABLE & OTHER PAY TELEVISION SERVICES\\nArticle Title: Roku and FreeWheel Announce Strategic Partnership to Bring Roku’s Leading Ad Tech to FreeWheel Customers\\nText: Additionally, eMarketer Link: https://cts.businesswire.com/ct/CT?id=smartlink&url=https%3A%2F%2Fwww.insiderintelligence.com%2Finsights%2Favod-more-than-50-percent-of-us-digital-video-viewers%2F&esheet=53451144&newsitemid=20230712907788&lan=en-US&anchor=eMarketer&index=4&md5=b64dea72bcf6b6379474462602781d83 projects 57% of U.S. digital video users will stream an advertising-based video on demand (AVOD) service this year.\\nHaving solutions aimed at driving greater interoperability and automation will help accelerate this growth.\\nKey highlights of this collaboration include:\\nStreamlined Integration: Roku has now integrated its demand application programming interface (dAPI) with FreeWheel s TV platform. Roku s demand API gives publishers direct, automatic and real-time access to more advertiser demand. This enhanced integration allows for streamlined ad operation workflows and better inventory quality control, both of which will improve publisher yield and revenue.\\nSeamless Data Targeting: Publishers can now use Roku platform signals to enable advertisers to target audiences and measure campaign performance without relying on cookies. 
Additionally, FreeWheel and Roku will rely on data clean room technology to enable the activation of additional data sets providing better measurement and monetization to publishers and agencies.', metadata={'_additional': {'id': '962b79e0-f9d1-43ae-9f7a-8a9b42bc7a9a'}, 'chunk_type': 'text', 'chunk_years_mentioned': [], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': 'PressRelease', 'data_source_link': 'https://www.nasdaq.com/press-release/roku-and-freewheel-announce-strategic-partnership-to-bring-rokus-leading-ad-tech-to', 'data_source_publish_date': '2023-07-12T00:00:00Z', 'data_source_uid': 'a46f309c-705d-3946-96db-87aa4e73261f', 'title': 'ROKU INC |  Roku and FreeWheel Announce Strategic Partnership to Bring Roku’s Leading Ad Tech to FreeWheel Customers'}),\n",
+       " Document(page_content='Company Name: ROKU INC \\n Company Industry: CABLE & OTHER PAY TELEVISION SERVICES \\n Form Title: 10-K 2022-FY \\n Form Section: Risk Factors \\n Text: nd the Note Regarding Forward Looking Statements.This section of this Annual Report generally discusses fiscal years 2022 and 2021 and year to year comparisons between those years.Discussions of fiscal year 2020 and year to year comparisons between fiscal years 2021 and 2020 that are not included in this Annual Report can be found in Management\\'s Discussion and Analysis of Financial Condition and Results of Operations in Part II, Item 7 of our Annual Report for the fiscal year ended December 31, 2021 filed with the SEC on February 18, 2022.Overview Effective as of the fourth quarter of fiscal 2022, we reorganized our reportable segments to better align with management\\'s reporting of information reviewed by the Chief Operating Decision Maker (\"CODM\") for each segment.We renamed our \"player\" segment to \"devices\" which now includes our licensing arrangements with service operators and licensed Roku TV partners in addition to sales of our streaming players, audio products, smart home products and Roku branded TVs that will be designed, made, and sold by us in 2023.Our historical segment information is recast to conform to our new presentation in our financial statements and accompanying notes included in Item 8 of this Annual Report.Our two reportable segments are the platform segment and the devices segment.', metadata={'_additional': {'id': 'a76c5fed-5d63-45a7-b63a-2c30e05140fc'}, 'chunk_type': 'text', 'chunk_years_mentioned': [2020, 2021, 2022, 2023], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': '10-K', 'data_source_link': 'https://www.sec.gov/Archives/edgar/data/1428439/000142843923000007', 'data_source_publish_date': '2022-01-01T00:00:00Z', 'data_source_uid': '0001428439-23-000007', 'title': 'ROKU INC 
|  10-K 2022-FY '}),\n",
+       " Document(page_content='Company Name: ROKU INC \\n Company Industry: CABLE & OTHER PAY TELEVISION SERVICES \\n Form Title: 10-Q 2023-Q1 \\n Form Section: Risk Factors \\n Text: Our current and potential partners include TV brands, cable and satellite companies, and telecommunication providers.Under these license arrangements, we generally have limited or no control over the amount and timing of resources these entities dedicate to the relationship.In the past, our licensed Roku TV partners have failed to meet their forecasts and anticipated market launch dates for distributing Roku TV models, and they may fail to meet their forecasts or such launches in the future.If our licensed Roku TV partners or service operator partners fail to meet their forecasts or such launches for distributing licensed streaming devices or choose to deploy competing streaming solutions within their product lines, our business may be harmed.We depend on a small number of content publishers for a majority of our streaming hours, and if we fail to maintain these relationships, our business could be harmed.*Historically, a small number of content publishers have accounted for a significant portion of the hours streamed on our platform.In the three months ended March 31, 2023, the top three streaming services represented over 50% of all hours streamed in the period.If, for any reason, we cease distributing channels that have historically streamed a large percentage of the aggregate streaming hours on our platform, our streaming hours, our active accounts, or Roku streaming device sales may be adversely affected, and our business may be harmed.', metadata={'_additional': {'id': '2a92b2bb-02a0-4e15-8b64-d7e04078a205'}, 'chunk_type': 'text', 'chunk_years_mentioned': [2023], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': '10-Q', 'data_source_link': 'https://www.sec.gov/Archives/edgar/data/1428439/000142843923000017', 
'data_source_publish_date': '2023-01-01T00:00:00Z', 'data_source_uid': '0001428439-23-000017', 'title': 'ROKU INC |  10-Q 2023-Q1 '})]"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "docs"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "21f6e9e5-478c-4b2c-9d61-f7a84f4d2f8f",
+   "metadata": {},
+   "source": [
+    "Usage in a chain\n",
+    "-"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "d1cba716-ab8d-4518-9196-43f17eb189dc",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      " ········\n"
+     ]
+    }
+   ],
+   "source": [
+    "OPENAI_API_KEY = getpass()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "79441f1f-fa06-452c-bcd6-160ad0debc6a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "0c504bcd-f6e0-4028-a797-b31fb4b6d027",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.chains import ConversationalRetrievalChain\n",
+    "\n",
+    "model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n",
+    "qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "977f158b-38d3-4b5f-9379-7cdd09436327",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-> **Question**: What were the biggest strategy changes and partnerships made by Roku in 2023? \n",
+      "\n",
+      "**Answer**: In 2023, Roku made a strategic partnership with FreeWheel to bring Roku's leading ad tech to FreeWheel customers. This partnership aimed to drive greater interoperability and automation in the advertising-based video on demand (AVOD) space. Key highlights of this collaboration include streamlined integration of Roku's demand application programming interface (dAPI) with FreeWheel's TV platform, allowing for better inventory quality control and improved publisher yield and revenue. Additionally, publishers can now use Roku platform signals to enable advertisers to target audiences and measure campaign performance without relying on cookies. This partnership also involves the use of data clean room technology to enable the activation of additional data sets for better measurement and monetization for publishers and agencies. These partnerships and strategies aim to support Roku's growth in the AVOD market. \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "questions = [\n",
+    "    \"What were the biggest strategy changes and partnerships made by Roku in 2023?\"\n",
+    "    # \"Where is Wex making the most money in 2023?\",\n",
+    "]\n",
+    "chat_history = []\n",
+    "\n",
+    "for question in questions:\n",
+    "    result = qa({\"question\": question, \"chat_history\": chat_history})\n",
+    "    chat_history.append((question, result[\"answer\"]))\n",
+    "    print(f\"-> **Question**: {question} \\n\")\n",
+    "    print(f\"**Answer**: {result['answer']} \\n\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/integrations/retrievers/pubmed.ipynb b/docs/extras/integrations/retrievers/pubmed.ipynb
index 336ee5fdd8..590538f6e8 100644
--- a/docs/extras/integrations/retrievers/pubmed.ipynb
+++ b/docs/extras/integrations/retrievers/pubmed.ipynb
@@ -81,7 +81,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.18"
   }
  },
  "nbformat": 4,
diff --git a/docs/extras/integrations/retrievers/sec_filings.ipynb b/docs/extras/integrations/retrievers/sec_filings.ipynb
new file mode 100644
index 0000000000..7b2f8c3255
--- /dev/null
+++ b/docs/extras/integrations/retrievers/sec_filings.ipynb
@@ -0,0 +1,165 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "263f914c-9d67-4316-8b3d-03c3b99ba9d8",
+   "metadata": {},
+   "source": [
+    "SEC filings data\n",
+    "=\n",
+    "\n",
+    "SEC filings data powered by [Kay.ai](https://kay.ai) and [Cybersyn](https://www.cybersyn.com/).\n",
+    "\n",
+    ">The SEC filing is a financial statement or other formal document submitted to the U.S. Securities and Exchange Commission (SEC). Public companies, certain insiders, and broker-dealers are required to make regular SEC filings. Investors and financial professionals rely on these filings for information about companies they are evaluating for investment purposes."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "fc507b8e-ea51-417c-93da-42bf998a1195",
+   "metadata": {},
+   "source": [
+    "Setup\n",
+    "=\n",
+    "\n",
+    "First you will need to install the `kay` package. You will also need an API key: you can get one for free at [https://kay.ai](https://kay.ai/). Once you have an API key, you must set it as an environment variable `KAY_API_KEY`.\n",
+    "\n",
+    "In this example we're going to use the `KayAiRetriever`. Take a look at the [kay notebook](/docs/integrations/retrievers/kay) for more detailed information on the parameters that it accepts."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c923bea0-585a-4f62-8662-efc167e8d793",
+   "metadata": {},
+   "source": [
+    "Examples\n",
+    "=\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f7b8c99c-0341-4f3c-912f-a11e98f7de71",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdin",
+     "output_type": "stream",
+     "text": [
+      " ········\n",
+      " ········\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Setup API keys for Kay and OpenAI\n",
+    "from getpass import getpass\n",
+    "KAY_API_KEY = getpass()\n",
+    "OPENAI_API_KEY = getpass()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "04ee2d6b-c2ab-4e15-8a8b-afaf6ef8c0f6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ[\"KAY_API_KEY\"] = KAY_API_KEY\n",
+    "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "0c504bcd-f6e0-4028-a797-b31fb4b6d027",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.chains import ConversationalRetrievalChain\n",
+    "from langchain.chat_models import ChatOpenAI\n",
+    "from langchain.retrievers import KayAiRetriever\n",
+    "\n",
+    "model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n",
+    "retriever = KayAiRetriever.create(dataset_id=\"company\", data_types=[\"10-K\", \"10-Q\"], num_contexts=6)\n",
+    "qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "977f158b-38d3-4b5f-9379-7cdd09436327",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "-> **Question**: What are patterns in Nvidia's spend over the past three quarters? \n",
+      "\n",
+      "**Answer**: Based on the provided information, here are the patterns in NVIDIA's spend over the past three quarters:\n",
+      "\n",
+      "1. Research and Development Expenses:\n",
+      "   - Q3 2022: Increased by 34% compared to Q3 2021.\n",
+      "   - Q1 2023: Increased by 40% compared to Q1 2022.\n",
+      "   - Q2 2022: Increased by 25% compared to Q2 2021.\n",
+      "   \n",
+      "   Overall, research and development expenses have been consistently increasing over the past three quarters.\n",
+      "\n",
+      "2. Sales, General and Administrative Expenses:\n",
+      "   - Q3 2022: Increased by 8% compared to Q3 2021.\n",
+      "   - Q1 2023: Increased by 14% compared to Q1 2022.\n",
+      "   - Q2 2022: Decreased by 16% compared to Q2 2021.\n",
+      "   \n",
+      "   The pattern for sales, general and administrative expenses is not as consistent, with some quarters showing an increase and others showing a decrease.\n",
+      "\n",
+      "3. Total Operating Expenses:\n",
+      "   - Q3 2022: Increased by 25% compared to Q3 2021.\n",
+      "   - Q1 2023: Increased by 113% compared to Q1 2022.\n",
+      "   - Q2 2022: Increased by 9% compared to Q2 2021.\n",
+      "   \n",
+      "   Total operating expenses have generally been increasing over the past three quarters, with a significant increase in Q1 2023.\n",
+      "\n",
+      "Overall, the pattern indicates a consistent increase in research and development expenses and total operating expenses, while sales, general and administrative expenses show some fluctuations. \n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "questions = [\n",
+    "    \"What are patterns in Nvidia's spend over the past three quarters?\",\n",
+    "    #\"What are some recent challenges faced by the renewable energy sector?\",\n",
+    "]\n",
+    "chat_history = []\n",
+    "\n",
+    "for question in questions:\n",
+    "    result = qa({\"question\": question, \"chat_history\": chat_history})\n",
+    "    chat_history.append((question, result[\"answer\"]))\n",
+    "    print(f\"-> **Question**: {question} \\n\")\n",
+    "    print(f\"**Answer**: {result['answer']} \\n\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.18"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/docs/extras/use_cases/web_scraping.ipynb b/docs/extras/use_cases/web_scraping.ipynb
index 07a571aaf2..a9aa4a32b1 100644
--- a/docs/extras/use_cases/web_scraping.ipynb
+++ b/docs/extras/use_cases/web_scraping.ipynb
@@ -1,663 +1,664 @@
 {
- "cells": [
-  {
-   "cell_type": "raw",
-   "id": "e254cf03-49fc-4051-a4df-3a8e4e7d2688",
-   "metadata": {},
-   "source": [
-    "---\n",
-    "sidebar_position: 1\n",
-    "title: Web scraping\n",
-    "---"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6605e7f7",
-   "metadata": {},
-   "source": [
-    "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/web_scraping.ipynb)\n",
-    "\n",
-    "## Use case\n",
-    "\n",
-    "[Web research](https://blog.langchain.dev/automating-web-research/) is one of the killer LLM applications:\n",
-    "\n",
-    "* Users have [highlighted it](https://twitter.com/GregKamradt/status/1679913813297225729?s=20) as one of his top desired AI tools. \n",
-    "* OSS repos like [gpt-researcher](https://github.com/assafelovic/gpt-researcher) are growing in popularity. \n",
-    " \n",
-    "![Image description](/img/web_scraping.png)\n",
-    " \n",
-    "## Overview\n",
-    "\n",
-    "Gathering content from the web has a few components:\n",
-    "\n",
-    "* `Search`: Query to url (e.g., using `GoogleSearchAPIWrapper`).\n",
-    "* `Loading`: Url to HTML  (e.g., using `AsyncHtmlLoader`, `AsyncChromiumLoader`, etc).\n",
-    "* `Transforming`: HTML to formatted text (e.g., using `HTML2Text` or `Beautiful Soup`).\n",
-    "\n",
-    "## Quickstart"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1803c182",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pip install -q openai langchain playwright beautifulsoup4\n",
-    "playwright install\n",
-    "\n",
-    "# Set env var OPENAI_API_KEY or load from a .env file:\n",
-    "# import dotenv\n",
-    "# dotenv.load_dotenv()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "50741083",
-   "metadata": {},
-   "source": [
-    "Scraping HTML content using a headless instance of Chromium.\n",
-    "\n",
-    "* The async nature of the scraping process is handled using Python's asyncio library.\n",
-    "* The actual interaction with the web pages is handled by Playwright."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "cd457cb1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import AsyncChromiumLoader\n",
-    "from langchain.document_transformers import BeautifulSoupTransformer\n",
-    "\n",
-    "# Load HTML\n",
-    "loader = AsyncChromiumLoader([\"https://www.wsj.com\"])\n",
-    "html = loader.load()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2a879806",
-   "metadata": {},
-   "source": [
-    "Scrape text content tags such as `<p>, <li>, <div>, and <a>` tags from the HTML content:\n",
-    "\n",
-    "* `<p>`: The paragraph tag. It defines a paragraph in HTML and is used to group together related sentences and/or phrases.\n",
-    " \n",
-    "* `<li>`: The list item tag. It is used within ordered (`<ol>`) and unordered (`<ul>`) lists to define individual items within the list.\n",
-    " \n",
-    "* `<div>`: The division tag. It is a block-level element used to group other inline or block-level elements.\n",
-    " \n",
-    "* `<a>`: The anchor tag. It is used to define hyperlinks.\n",
-    "\n",
-    "* `<span>`:  an inline container used to mark up a part of a text, or a part of a document. \n",
-    "\n",
-    "For many news websites (e.g., WSJ, CNN), headlines and summaries are all in `<span>` tags."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "141f206b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Transform\n",
-    "bs_transformer = BeautifulSoupTransformer()\n",
-    "docs_transformed = bs_transformer.transform_documents(html,tags_to_extract=[\"span\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "id": "73ddb234",
-   "metadata": {},
-   "outputs": [
+  "cells": [
     {
-     "data": {
-      "text/plain": [
-       "'English EditionEnglish中文 (Chinese)日本語 (Japanese) More Other Products from WSJBuy Side from WSJWSJ ShopWSJ Wine Other Products from WSJ Search Quotes and Companies Search Quotes and Companies 0.15% 0.03% 0.12% -0.42% 4.102% -0.69% -0.25% -0.15% -1.82% 0.24% 0.19% -1.10% About Evan His Family Reflects His Reporting How You Can Help Write a Message Life in Detention Latest News Get Email Updates Four Americans Released From Iranian Prison The Americans will remain under house arrest until they are '"
+      "cell_type": "raw",
+      "id": "e254cf03-49fc-4051-a4df-3a8e4e7d2688",
+      "metadata": {},
+      "source": [
+        "---\n",
+        "sidebar_position: 1\n",
+        "title: Web scraping\n",
+        "---"
       ]
-     },
-     "execution_count": 4,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Result\n",
-    "docs_transformed[0].page_content[0:500]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7d26d185",
-   "metadata": {},
-   "source": [
-    "These `Documents` now are staged for downstream usage in various LLM apps, as discussed below.\n",
-    "\n",
-    "## Loader\n",
-    "\n",
-    "### AsyncHtmlLoader\n",
-    "\n",
-    "The [AsyncHtmlLoader](docs/integrations/document_loaders/async_html) uses the `aiohttp` library to make asynchronous HTTP requests, suitable for simpler and lightweight scraping.\n",
-    "\n",
-    "### AsyncChromiumLoader\n",
-    "\n",
-    "The [AsyncChromiumLoader](docs/integrations/document_loaders/async_chromium) uses Playwright to launch a Chromium instance, which can handle JavaScript rendering and more complex web interactions.\n",
-    "\n",
-    "Chromium is one of the browsers supported by Playwright, a library used to control browser automation. \n",
-    "\n",
-    "Headless mode means that the browser is running without a graphical user interface, which is commonly used for web scraping."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8941e855",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.document_loaders import AsyncHtmlLoader\n",
-    "urls = [\"https://www.espn.com\",\"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
-    "loader = AsyncHtmlLoader(urls)\n",
-    "docs = loader.load()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e47f4bf0",
-   "metadata": {},
-   "source": [
-    "## Transformer\n",
-    "\n",
-    "### HTML2Text\n",
-    "\n",
-    "[HTML2Text](docs/integrations/document_transformers/html2text) provides a straightforward conversion of HTML content into plain text (with markdown-like formatting) without any specific tag manipulation. \n",
-    "\n",
-    "It's best suited for scenarios where the goal is to extract human-readable text without needing to manipulate specific HTML elements.\n",
-    "\n",
-    "### Beautiful Soup\n",
-    " \n",
-    "Beautiful Soup offers more fine-grained control over HTML content, enabling specific tag extraction, removal, and content cleaning. \n",
-    "\n",
-    "It's suited for cases where you want to extract specific information and clean up the HTML content according to your needs."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "99a7e2a8",
-   "metadata": {},
-   "outputs": [
+    },
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "Fetching pages: 100%|#############################################################################################################| 2/2 [00:00<00:00,  7.01it/s]\n"
-     ]
-    }
-   ],
-   "source": [
-    "from langchain.document_loaders import AsyncHtmlLoader\n",
-    "urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
-    "loader = AsyncHtmlLoader(urls)\n",
-    "docs = loader.load()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "a2cd3e8d",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "markdown",
+      "id": "6605e7f7",
+      "metadata": {},
+      "source": [
+        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/web_scraping.ipynb)\n",
+        "\n",
+        "## Use case\n",
+        "\n",
+        "[Web research](https://blog.langchain.dev/automating-web-research/) is one of the killer LLM applications:\n",
+        "\n",
+        "* Users have [highlighted it](https://twitter.com/GregKamradt/status/1679913813297225729?s=20) as one of their top desired AI tools. \n",
+        "* OSS repos like [gpt-researcher](https://github.com/assafelovic/gpt-researcher) are growing in popularity. \n",
+        " \n",
+        "![Image description](/img/web_scraping.png)\n",
+        " \n",
+        "## Overview\n",
+        "\n",
+        "Gathering content from the web has a few components:\n",
+        "\n",
+        "* `Search`: Query to url (e.g., using `GoogleSearchAPIWrapper`).\n",
+        "* `Loading`: Url to HTML  (e.g., using `AsyncHtmlLoader`, `AsyncChromiumLoader`, etc).\n",
+        "* `Transforming`: HTML to formatted text (e.g., using `HTML2Text` or `Beautiful Soup`).\n",
+        "\n",
+        "## Quickstart"
+      ]
+    },
     {
-     "data": {
-      "text/plain": [
-       "\"Skip to main content  Skip to navigation\\n\\n<\\n\\n>\\n\\nMenu\\n\\n## ESPN\\n\\n  * Search\\n\\n  *   * scores\\n\\n  * NFL\\n  * MLB\\n  * NBA\\n  * NHL\\n  * Soccer\\n  * NCAAF\\n  * …\\n\\n    * Women's World Cup\\n    * LLWS\\n    * NCAAM\\n    * NCAAW\\n    * Sports Betting\\n    * Boxing\\n    * CFL\\n    * NCAA\\n    * Cricket\\n    * F1\\n    * Golf\\n    * Horse\\n    * MMA\\n    * NASCAR\\n    * NBA G League\\n    * Olympic Sports\\n    * PLL\\n    * Racing\\n    * RN BB\\n    * RN FB\\n    * Rugby\\n    * Tennis\\n    * WNBA\\n    * WWE\\n    * X Games\\n    * XFL\\n\\n  * More\""
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "1803c182",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "pip install -q openai langchain playwright beautifulsoup4\n",
+        "playwright install\n",
+        "\n",
+        "# Set env var OPENAI_API_KEY or load from a .env file:\n",
+        "# import dotenv\n",
+        "# dotenv.load_dotenv()"
       ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "from langchain.document_transformers import Html2TextTransformer\n",
-    "html2text = Html2TextTransformer()\n",
-    "docs_transformed = html2text.transform_documents(docs)\n",
-    "docs_transformed[0].page_content[0:500]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8aef9861",
-   "metadata": {},
-   "source": [
-    "## Scraping with extraction\n",
-    "\n",
-    "### LLM with function calling\n",
-    "\n",
-    "Web scraping is challenging for many reasons. \n",
-    "\n",
-    "One of them is the changing nature of modern websites' layouts and content, which requires modifying scraping scripts to accommodate the changes.\n",
-    "\n",
-    "Using Function (e.g., OpenAI) with an extraction chain, we avoid having to change your code constantly when websites change. \n",
-    "\n",
-    "We're using `gpt-3.5-turbo-0613` to guarantee access to OpenAI Functions feature (although this might be available to everyone by time of writing). \n",
-    "\n",
-    "We're also keeping `temperature` at `0` to keep randomness of the LLM down."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "52d49f6f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.chat_models import ChatOpenAI\n",
-    "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "fc5757ce",
-   "metadata": {},
-   "source": [
-    "### Define a schema\n",
-    "\n",
-    "Next, you define a schema to specify what kind of data you want to extract. \n",
-    "\n",
-    "Here, the key names matter as they tell the LLM what kind of information they want. \n",
-    "\n",
-    "So, be as detailed as possible. \n",
-    "\n",
-    "In this example, we want to scrape only news article's name and summary from The Wall Street Journal website."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "id": "95506f8e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.chains import create_extraction_chain\n",
-    "\n",
-    "schema = {\n",
-    "    \"properties\": {\n",
-    "        \"news_article_title\": {\"type\": \"string\"},\n",
-    "        \"news_article_summary\": {\"type\": \"string\"},\n",
-    "    },\n",
-    "    \"required\": [\"news_article_title\", \"news_article_summary\"],\n",
-    "}\n",
-    "\n",
-    "def extract(content: str, schema: dict):\n",
-    "    return create_extraction_chain(schema=schema, llm=llm).run(content)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "97f7de42",
-   "metadata": {},
-   "source": [
-    "### Run the web scraper w/ BeautifulSoup\n",
-    "\n",
-    "As shown above, we'll using `BeautifulSoupTransformer`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "id": "977560ba",
-   "metadata": {},
-   "outputs": [
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Extracting content with LLM\n",
-      "[{'news_article_summary': 'The Americans will remain under house arrest until '\n",
-      "                          'they are allowed to return to the U.S. in coming '\n",
-      "                          'weeks, following a monthslong diplomatic push by '\n",
-      "                          'the Biden administration.',\n",
-      "  'news_article_title': 'Four Americans Released From Iranian Prison'},\n",
-      " {'news_article_summary': 'Price pressures continued cooling last month, with '\n",
-      "                          'the CPI rising a mild 0.2% from June, likely '\n",
-      "                          'deterring the Federal Reserve from raising interest '\n",
-      "                          'rates at its September meeting.',\n",
-      "  'news_article_title': 'Cooler July Inflation Opens Door to Fed Pause on '\n",
-      "                        'Rates'},\n",
-      " {'news_article_summary': 'The company has decided to eliminate 27 of its 30 '\n",
-      "                          'clothing labels, such as Lark & Ro and Goodthreads, '\n",
-      "                          'as it works to fend off antitrust scrutiny and cut '\n",
-      "                          'costs.',\n",
-      "  'news_article_title': 'Amazon Cuts Dozens of House Brands'},\n",
-      " {'news_article_summary': 'President Biden’s order comes on top of a slowing '\n",
-      "                          'Chinese economy, Covid lockdowns and rising '\n",
-      "                          'tensions between the two powers.',\n",
-      "  'news_article_title': 'U.S. Investment Ban on China Poised to Deepen Divide'},\n",
-      " {'news_article_summary': 'The proposed trial date in the '\n",
-      "                          'election-interference case comes on the same day as '\n",
-      "                          'the former president’s not guilty plea on '\n",
-      "                          'additional Mar-a-Lago charges.',\n",
-      "  'news_article_title': 'Trump Should Be Tried in January, Prosecutors Tell '\n",
-      "                        'Judge'},\n",
-      " {'news_article_summary': 'The CEO who started in June says the platform has '\n",
-      "                          '“an entirely different road map” for the future.',\n",
-      "  'news_article_title': 'Yaccarino Says X Is Watching Threads but Has Its Own '\n",
-      "                        'Vision'},\n",
-      " {'news_article_summary': 'Students foot the bill for flagship state '\n",
-      "                          'universities that pour money into new buildings and '\n",
-      "                          'programs with little pushback.',\n",
-      "  'news_article_title': 'Colleges Spend Like There’s No Tomorrow. ‘These '\n",
-      "                        'Places Are Just Devouring Money.’'},\n",
-      " {'news_article_summary': 'Wildfires fanned by hurricane winds have torn '\n",
-      "                          'through parts of the Hawaiian island, devastating '\n",
-      "                          'the popular tourist town of Lahaina.',\n",
-      "  'news_article_title': 'Maui Wildfires Leave at Least 36 Dead'},\n",
-      " {'news_article_summary': 'After its large armored push stalled, Kyiv has '\n",
-      "                          'fallen back on the kind of tactics that brought it '\n",
-      "                          'success earlier in the war.',\n",
-      "  'news_article_title': 'Ukraine Uses Small-Unit Tactics to Retake Captured '\n",
-      "                        'Territory'},\n",
-      " {'news_article_summary': 'President Guillermo Lasso says the Aug. 20 election '\n",
-      "                          'will proceed, as the Andean country grapples with '\n",
-      "                          'rising drug gang violence.',\n",
-      "  'news_article_title': 'Ecuador Declares State of Emergency After '\n",
-      "                        'Presidential Hopeful Killed'},\n",
-      " {'news_article_summary': 'This year’s hurricane season, which typically runs '\n",
-      "                          'from June to the end of November, has been '\n",
-      "                          'difficult to predict, climate scientists said.',\n",
-      "  'news_article_title': 'Atlantic Hurricane Season Prediction Increased to '\n",
-      "                        '‘Above Normal,’ NOAA Says'},\n",
-      " {'news_article_summary': 'The NFL is raising the price of its NFL+ streaming '\n",
-      "                          'packages as it adds the NFL Network and RedZone.',\n",
-      "  'news_article_title': 'NFL to Raise Price of NFL+ Streaming Packages as It '\n",
-      "                        'Adds NFL Network, RedZone'},\n",
-      " {'news_article_summary': 'Russia is planning a moon mission as part of the '\n",
-      "                          'new space race.',\n",
-      "  'news_article_title': 'Russia’s Moon Mission and the New Space Race'},\n",
-      " {'news_article_summary': 'Tapestry’s $8.5 billion acquisition of Capri would '\n",
-      "                          'create a conglomerate with more than $12 billion in '\n",
-      "                          'annual sales, but it would still lack the '\n",
-      "                          'high-wattage labels and diversity that have fueled '\n",
-      "                          'LVMH’s success.',\n",
-      "  'news_article_title': \"Why the Coach and Kors Marriage Doesn't Scare LVMH\"},\n",
-      " {'news_article_summary': 'The Supreme Court has blocked Purdue Pharma’s $6 '\n",
-      "                          'billion Sackler opioid settlement.',\n",
-      "  'news_article_title': 'Supreme Court Blocks Purdue Pharma’s $6 Billion '\n",
-      "                        'Sackler Opioid Settlement'},\n",
-      " {'news_article_summary': 'The Social Security COLA is expected to rise in '\n",
-      "                          '2024, but not by a lot.',\n",
-      "  'news_article_title': 'Social Security COLA Expected to Rise in 2024, but '\n",
-      "                        'Not by a Lot'}]\n"
-     ]
-    }
-   ],
-   "source": [
-    "import pprint\n",
-    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
-    "\n",
-    "def scrape_with_playwright(urls, schema):\n",
-    "    \n",
-    "    loader = AsyncChromiumLoader(urls)\n",
-    "    docs = loader.load()\n",
-    "    bs_transformer = BeautifulSoupTransformer()\n",
-    "    docs_transformed = bs_transformer.transform_documents(docs,tags_to_extract=[\"span\"])\n",
-    "    print(\"Extracting content with LLM\")\n",
-    "    \n",
-    "    # Grab the first 1000 tokens of the site\n",
-    "    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, \n",
-    "                                                                    chunk_overlap=0)\n",
-    "    splits = splitter.split_documents(docs_transformed)\n",
-    "    \n",
-    "    # Process the first split \n",
-    "    extracted_content = extract(\n",
-    "        schema=schema, content=splits[0].page_content\n",
-    "    )\n",
-    "    pprint.pprint(extracted_content)\n",
-    "    return extracted_content\n",
-    "\n",
-    "urls = [\"https://www.wsj.com\"]\n",
-    "extracted_content = scrape_with_playwright(urls, schema=schema)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b08a8cef",
-   "metadata": {},
-   "source": [
-    "We can compare the headlines scraped to the page:\n",
-    "\n",
-    "![Image description](/img/wsj_page.png)\n",
-    "\n",
-    "Looking at the [LangSmith trace](https://smith.langchain.com/public/c3070198-5b13-419b-87bf-3821cdf34fa6/r), we can see what is going on under the hood:\n",
-    "\n",
-    "* It's following what is explained in the [extraction](docs/use_cases/extraction).\n",
-    "* We call the `information_extraction` function on the input text.\n",
-    "* It will attempt to populate the provided schema from the url content."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a5a6f11e",
-   "metadata": {},
-   "source": [
-    "## Research automation\n",
-    "\n",
-    "Related to scraping, we may want to answer specific questions using searched content.\n",
-    "\n",
-    "We can automate the process of [web research](https://blog.langchain.dev/automating-web-research/) using a retriever, such as the `WebResearchRetriever` ([docs](https://python.langchain.com/docs/modules/data_connection/retrievers/web_research)).\n",
-    "\n",
-    "![Image description](/img/web_research.png)\n",
-    "\n",
-    "Copy requirements [from here](https://github.com/langchain-ai/web-explorer/blob/main/requirements.txt):\n",
-    "\n",
-    "`pip install -r requirements.txt`\n",
-    " \n",
-    "Set `GOOGLE_CSE_ID` and `GOOGLE_API_KEY`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "id": "414f0d41",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from langchain.vectorstores import Chroma\n",
-    "from langchain.embeddings import OpenAIEmbeddings\n",
-    "from langchain.chat_models.openai import ChatOpenAI\n",
-    "from langchain.utilities import GoogleSearchAPIWrapper\n",
-    "from langchain.retrievers.web_research import WebResearchRetriever"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 41,
-   "id": "5d1ce098",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Vectorstore\n",
-    "vectorstore = Chroma(embedding_function=OpenAIEmbeddings(),persist_directory=\"./chroma_db_oai\")\n",
-    "\n",
-    "# LLM\n",
-    "llm = ChatOpenAI(temperature=0)\n",
-    "\n",
-    "# Search \n",
-    "search = GoogleSearchAPIWrapper()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6d808b9d",
-   "metadata": {},
-   "source": [
-    "Initialize retriever with the above tools to:\n",
-    "    \n",
-    "* Use an LLM to generate multiple relevant search queries (one LLM call)\n",
-    "* Execute a search for each query\n",
-    "* Choose the top K links per query  (multiple search calls in parallel)\n",
-    "* Load the information from all chosen links (scrape pages in parallel)\n",
-    "* Index those documents into a vectorstore\n",
-    "* Find the most relevant documents for each original generated search query"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 42,
-   "id": "e3e3a589",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Initialize\n",
-    "web_research_retriever = WebResearchRetriever.from_llm(\n",
-    "    vectorstore=vectorstore,\n",
-    "    llm=llm, \n",
-    "    search=search)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 44,
-   "id": "20655b74",
-   "metadata": {},
-   "outputs": [
+      "cell_type": "markdown",
+      "id": "50741083",
+      "metadata": {},
+      "source": [
+        "Scraping HTML content using a headless instance of Chromium.\n",
+        "\n",
+        "* The async nature of the scraping process is handled using Python's asyncio library.\n",
+        "* The actual interaction with the web pages is handled by Playwright."
+      ]
+    },
     {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "INFO:langchain.retrievers.web_research:Generating questions for Google Search ...\n",
-      "INFO:langchain.retrievers.web_research:Questions for Google Search (raw): {'question': 'How do LLM Powered Autonomous Agents work?', 'text': LineList(lines=['1. What is the functioning principle of LLM Powered Autonomous Agents?\\n', '2. How do LLM Powered Autonomous Agents operate?\\n'])}\n",
-      "INFO:langchain.retrievers.web_research:Questions for Google Search: ['1. What is the functioning principle of LLM Powered Autonomous Agents?\\n', '2. How do LLM Powered Autonomous Agents operate?\\n']\n",
-      "INFO:langchain.retrievers.web_research:Searching for relevat urls ...\n",
-      "INFO:langchain.retrievers.web_research:Searching for relevat urls ...\n",
-      "INFO:langchain.retrievers.web_research:Search results: [{'title': 'LLM Powered Autonomous Agents | Hacker News', 'link': 'https://news.ycombinator.com/item?id=36488871', 'snippet': 'Jun 26, 2023 ... Exactly. A temperature of 0 means you always pick the highest probability token (i.e. the \"max\" function), while a temperature of 1 means you\\xa0...'}]\n",
-      "INFO:langchain.retrievers.web_research:Searching for relevat urls ...\n",
-      "INFO:langchain.retrievers.web_research:Search results: [{'title': \"LLM Powered Autonomous Agents | Lil'Log\", 'link': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'snippet': 'Jun 23, 2023 ... Task decomposition can be done (1) by LLM with simple prompting like \"Steps for XYZ.\\\\n1.\" , \"What are the subgoals for achieving XYZ?\" , (2) by\\xa0...'}]\n",
-      "INFO:langchain.retrievers.web_research:New URLs to load: []\n",
-      "INFO:langchain.retrievers.web_research:Grabbing most relevant splits from urls...\n"
-     ]
+      "cell_type": "code",
+      "execution_count": 2,
+      "id": "cd457cb1",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.document_loaders import AsyncChromiumLoader\n",
+        "from langchain.document_transformers import BeautifulSoupTransformer\n",
+        "\n",
+        "# Load HTML\n",
+        "loader = AsyncChromiumLoader([\"https://www.wsj.com\"])\n",
+        "html = loader.load()"
+      ]
     },
     {
-     "data": {
-      "text/plain": [
-       "{'question': 'How do LLM Powered Autonomous Agents work?',\n",
-       " 'answer': \"LLM-powered autonomous agents work by using LLM as the agent's brain, complemented by several key components such as planning, memory, and tool use. In terms of planning, the agent breaks down large tasks into smaller subgoals and can reflect and refine its actions based on past experiences. Memory is divided into short-term memory, which is used for in-context learning, and long-term memory, which allows the agent to retain and recall information over extended periods. Tool use involves the agent calling external APIs for additional information. These agents have been used in various applications, including scientific discovery and generative agents simulation.\",\n",
-       " 'sources': ''}"
+      "cell_type": "markdown",
+      "id": "2a879806",
+      "metadata": {},
+      "source": [
+        "Scrape text content from tags such as `<p>`, `<li>`, `<div>`, and `<a>` in the HTML content:\n",
+        "\n",
+        "* `<p>`: The paragraph tag. It defines a paragraph in HTML and is used to group together related sentences and/or phrases.\n",
+        " \n",
+        "* `<li>`: The list item tag. It is used within ordered (`<ol>`) and unordered (`<ul>`) lists to define individual items within the list.\n",
+        " \n",
+        "* `<div>`: The division tag. It is a block-level element used to group other inline or block-level elements.\n",
+        " \n",
+        "* `<a>`: The anchor tag. It is used to define hyperlinks.\n",
+        "\n",
+        "* `<span>`: The span tag. It is an inline container used to mark up part of a text or part of a document. \n",
+        "\n",
+        "For many news websites (e.g., WSJ, CNN), headlines and summaries are all in `<span>` tags."
       ]
-     },
-     "execution_count": 44,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Run\n",
-    "import logging\n",
-    "logging.basicConfig()\n",
-    "logging.getLogger(\"langchain.retrievers.web_research\").setLevel(logging.INFO)\n",
-    "from langchain.chains import RetrievalQAWithSourcesChain\n",
-    "user_input = \"How do LLM Powered Autonomous Agents work?\"\n",
-    "qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm,retriever=web_research_retriever)\n",
-    "result = qa_chain({\"question\": user_input})\n",
-    "result"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Going deeper \n",
-    "\n",
-    "* Here's a [app](https://github.com/langchain-ai/web-explorer/tree/main) that wraps this retriver with a lighweight UI."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "312c399e",
-   "metadata": {},
-   "source": [
-    "## Question answering over a website\n",
-    "\n",
-    "To answer questions over a specific website, you can use Apify's [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor, which can deeply crawl websites such as documentation, knowledge bases, help centers, or blogs,\n",
-    "and extract text content from the web pages.\n",
-    "\n",
-    "In the example below, we will deeply crawl the Python documentation of LangChain's Chat LLM models and answer a question over it.\n",
-    "\n",
-    "First, install the requirements\n",
-    "`pip install apify-client openai langchain chromadb tiktoken`\n",
-    " \n",
-    "Next, set `OPENAI_API_KEY` and `APIFY_API_TOKEN` in your environment variables.\n",
-    "\n",
-    "The full code follows:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "id": "9b08da5e",
-   "metadata": {},
-   "outputs": [
+    },
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      " Yes, LangChain offers integration with OpenAI chat models. You can use the ChatOpenAI class to interact with OpenAI models.\n"
-     ]
+      "cell_type": "code",
+      "execution_count": 3,
+      "id": "141f206b",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Transform\n",
+        "bs_transformer = BeautifulSoupTransformer()\n",
+        "docs_transformed = bs_transformer.transform_documents(html,tags_to_extract=[\"span\"])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 4,
+      "id": "73ddb234",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "'English EditionEnglish\u4e2d\u6587 (Chinese)\u65e5\u672c\u8a9e (Japanese) More Other Products from WSJBuy Side from WSJWSJ ShopWSJ Wine Other Products from WSJ Search Quotes and Companies Search Quotes and Companies 0.15% 0.03% 0.12% -0.42% 4.102% -0.69% -0.25% -0.15% -1.82% 0.24% 0.19% -1.10% About Evan His Family Reflects His Reporting How You Can Help Write a Message Life in Detention Latest News Get Email Updates Four Americans Released From Iranian Prison The Americans will remain under house arrest until they are '"
+            ]
+          },
+          "execution_count": 4,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "# Result\n",
+        "docs_transformed[0].page_content[0:500]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "7d26d185",
+      "metadata": {},
+      "source": [
+        "These `Documents` now are staged for downstream usage in various LLM apps, as discussed below.\n",
+        "\n",
+        "## Loader\n",
+        "\n",
+        "### AsyncHtmlLoader\n",
+        "\n",
+        "The [AsyncHtmlLoader](docs/integrations/document_loaders/async_html) uses the `aiohttp` library to make asynchronous HTTP requests, suitable for simpler and lightweight scraping.\n",
+        "\n",
+        "### AsyncChromiumLoader\n",
+        "\n",
+        "The [AsyncChromiumLoader](docs/integrations/document_loaders/async_chromium) uses Playwright to launch a Chromium instance, which can handle JavaScript rendering and more complex web interactions.\n",
+        "\n",
+        "Chromium is one of the browsers supported by Playwright, a library used to control browser automation. \n",
+        "\n",
+        "Headless mode means that the browser is running without a graphical user interface, which is commonly used for web scraping."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "id": "8941e855",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.document_loaders import AsyncHtmlLoader\n",
+        "urls = [\"https://www.espn.com\",\"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
+        "loader = AsyncHtmlLoader(urls)\n",
+        "docs = loader.load()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "e47f4bf0",
+      "metadata": {},
+      "source": [
+        "## Transformer\n",
+        "\n",
+        "### HTML2Text\n",
+        "\n",
+        "[HTML2Text](docs/integrations/document_transformers/html2text) provides a straightforward conversion of HTML content into plain text (with markdown-like formatting) without any specific tag manipulation. \n",
+        "\n",
+        "It's best suited for scenarios where the goal is to extract human-readable text without needing to manipulate specific HTML elements.\n",
+        "\n",
+        "### Beautiful Soup\n",
+        " \n",
+        "Beautiful Soup offers more fine-grained control over HTML content, enabling specific tag extraction, removal, and content cleaning. \n",
+        "\n",
+        "It's suited for cases where you want to extract specific information and clean up the HTML content according to your needs."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "id": "99a7e2a8",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "Fetching pages: 100%|#############################################################################################################| 2/2 [00:00<00:00,  7.01it/s]\n"
+          ]
+        }
+      ],
+      "source": [
+        "from langchain.document_loaders import AsyncHtmlLoader\n",
+        "urls = [\"https://www.espn.com\", \"https://lilianweng.github.io/posts/2023-06-23-agent/\"]\n",
+        "loader = AsyncHtmlLoader(urls)\n",
+        "docs = loader.load()"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "id": "a2cd3e8d",
+      "metadata": {},
+      "outputs": [
+        {
+          "data": {
+            "text/plain": [
+              "\"Skip to main content  Skip to navigation\\n\\n<\\n\\n>\\n\\nMenu\\n\\n## ESPN\\n\\n  * Search\\n\\n  *   * scores\\n\\n  * NFL\\n  * MLB\\n  * NBA\\n  * NHL\\n  * Soccer\\n  * NCAAF\\n  * \u2026\\n\\n    * Women's World Cup\\n    * LLWS\\n    * NCAAM\\n    * NCAAW\\n    * Sports Betting\\n    * Boxing\\n    * CFL\\n    * NCAA\\n    * Cricket\\n    * F1\\n    * Golf\\n    * Horse\\n    * MMA\\n    * NASCAR\\n    * NBA G League\\n    * Olympic Sports\\n    * PLL\\n    * Racing\\n    * RN BB\\n    * RN FB\\n    * Rugby\\n    * Tennis\\n    * WNBA\\n    * WWE\\n    * X Games\\n    * XFL\\n\\n  * More\""
+            ]
+          },
+          "execution_count": 7,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "from langchain.document_transformers import Html2TextTransformer\n",
+        "html2text = Html2TextTransformer()\n",
+        "docs_transformed = html2text.transform_documents(docs)\n",
+        "docs_transformed[0].page_content[0:500]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "8aef9861",
+      "metadata": {},
+      "source": [
+        "## Scraping with extraction\n",
+        "\n",
+        "### LLM with function calling\n",
+        "\n",
+        "Web scraping is challenging for many reasons. \n",
+        "\n",
+        "One of them is the changing nature of modern websites' layouts and content, which requires modifying scraping scripts to accommodate the changes.\n",
+        "\n",
+        "Using function calling (e.g., OpenAI Functions) with an extraction chain, we avoid having to change our code constantly when websites change. \n",
+        "\n",
+        "We're using `gpt-3.5-turbo-0613` to guarantee access to the OpenAI Functions feature (although this might be available to everyone by the time of writing). \n",
+        "\n",
+        "We're also keeping `temperature` at `0` to keep randomness of the LLM down."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 5,
+      "id": "52d49f6f",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.chat_models import ChatOpenAI\n",
+        "llm = ChatOpenAI(temperature=0, model=\"gpt-3.5-turbo-0613\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "fc5757ce",
+      "metadata": {},
+      "source": [
+        "### Define a schema\n",
+        "\n",
+        "Next, you define a schema to specify what kind of data you want to extract. \n",
+        "\n",
+        "Here, the key names matter as they tell the LLM what kind of information you want. \n",
+        "\n",
+        "So, be as detailed as possible. \n",
+        "\n",
+        "In this example, we want to scrape only each news article's title and summary from The Wall Street Journal website."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 6,
+      "id": "95506f8e",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.chains import create_extraction_chain\n",
+        "\n",
+        "schema = {\n",
+        "    \"properties\": {\n",
+        "        \"news_article_title\": {\"type\": \"string\"},\n",
+        "        \"news_article_summary\": {\"type\": \"string\"},\n",
+        "    },\n",
+        "    \"required\": [\"news_article_title\", \"news_article_summary\"],\n",
+        "}\n",
+        "\n",
+        "def extract(content: str, schema: dict):\n",
+        "    return create_extraction_chain(schema=schema, llm=llm).run(content)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "97f7de42",
+      "metadata": {},
+      "source": [
+        "### Run the web scraper w/ BeautifulSoup\n",
+        "\n",
+        "As shown above, we'll be using `BeautifulSoupTransformer`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 7,
+      "id": "977560ba",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Extracting content with LLM\n",
+            "[{'news_article_summary': 'The Americans will remain under house arrest until '\n",
+            "                          'they are allowed to return to the U.S. in coming '\n",
+            "                          'weeks, following a monthslong diplomatic push by '\n",
+            "                          'the Biden administration.',\n",
+            "  'news_article_title': 'Four Americans Released From Iranian Prison'},\n",
+            " {'news_article_summary': 'Price pressures continued cooling last month, with '\n",
+            "                          'the CPI rising a mild 0.2% from June, likely '\n",
+            "                          'deterring the Federal Reserve from raising interest '\n",
+            "                          'rates at its September meeting.',\n",
+            "  'news_article_title': 'Cooler July Inflation Opens Door to Fed Pause on '\n",
+            "                        'Rates'},\n",
+            " {'news_article_summary': 'The company has decided to eliminate 27 of its 30 '\n",
+            "                          'clothing labels, such as Lark & Ro and Goodthreads, '\n",
+            "                          'as it works to fend off antitrust scrutiny and cut '\n",
+            "                          'costs.',\n",
+            "  'news_article_title': 'Amazon Cuts Dozens of House Brands'},\n",
+            " {'news_article_summary': 'President Biden\u2019s order comes on top of a slowing '\n",
+            "                          'Chinese economy, Covid lockdowns and rising '\n",
+            "                          'tensions between the two powers.',\n",
+            "  'news_article_title': 'U.S. Investment Ban on China Poised to Deepen Divide'},\n",
+            " {'news_article_summary': 'The proposed trial date in the '\n",
+            "                          'election-interference case comes on the same day as '\n",
+            "                          'the former president\u2019s not guilty plea on '\n",
+            "                          'additional Mar-a-Lago charges.',\n",
+            "  'news_article_title': 'Trump Should Be Tried in January, Prosecutors Tell '\n",
+            "                        'Judge'},\n",
+            " {'news_article_summary': 'The CEO who started in June says the platform has '\n",
+            "                          '\u201can entirely different road map\u201d for the future.',\n",
+            "  'news_article_title': 'Yaccarino Says X Is Watching Threads but Has Its Own '\n",
+            "                        'Vision'},\n",
+            " {'news_article_summary': 'Students foot the bill for flagship state '\n",
+            "                          'universities that pour money into new buildings and '\n",
+            "                          'programs with little pushback.',\n",
+            "  'news_article_title': 'Colleges Spend Like There\u2019s No Tomorrow. \u2018These '\n",
+            "                        'Places Are Just Devouring Money.\u2019'},\n",
+            " {'news_article_summary': 'Wildfires fanned by hurricane winds have torn '\n",
+            "                          'through parts of the Hawaiian island, devastating '\n",
+            "                          'the popular tourist town of Lahaina.',\n",
+            "  'news_article_title': 'Maui Wildfires Leave at Least 36 Dead'},\n",
+            " {'news_article_summary': 'After its large armored push stalled, Kyiv has '\n",
+            "                          'fallen back on the kind of tactics that brought it '\n",
+            "                          'success earlier in the war.',\n",
+            "  'news_article_title': 'Ukraine Uses Small-Unit Tactics to Retake Captured '\n",
+            "                        'Territory'},\n",
+            " {'news_article_summary': 'President Guillermo Lasso says the Aug. 20 election '\n",
+            "                          'will proceed, as the Andean country grapples with '\n",
+            "                          'rising drug gang violence.',\n",
+            "  'news_article_title': 'Ecuador Declares State of Emergency After '\n",
+            "                        'Presidential Hopeful Killed'},\n",
+            " {'news_article_summary': 'This year\u2019s hurricane season, which typically runs '\n",
+            "                          'from June to the end of November, has been '\n",
+            "                          'difficult to predict, climate scientists said.',\n",
+            "  'news_article_title': 'Atlantic Hurricane Season Prediction Increased to '\n",
+            "                        '\u2018Above Normal,\u2019 NOAA Says'},\n",
+            " {'news_article_summary': 'The NFL is raising the price of its NFL+ streaming '\n",
+            "                          'packages as it adds the NFL Network and RedZone.',\n",
+            "  'news_article_title': 'NFL to Raise Price of NFL+ Streaming Packages as It '\n",
+            "                        'Adds NFL Network, RedZone'},\n",
+            " {'news_article_summary': 'Russia is planning a moon mission as part of the '\n",
+            "                          'new space race.',\n",
+            "  'news_article_title': 'Russia\u2019s Moon Mission and the New Space Race'},\n",
+            " {'news_article_summary': 'Tapestry\u2019s $8.5 billion acquisition of Capri would '\n",
+            "                          'create a conglomerate with more than $12 billion in '\n",
+            "                          'annual sales, but it would still lack the '\n",
+            "                          'high-wattage labels and diversity that have fueled '\n",
+            "                          'LVMH\u2019s success.',\n",
+            "  'news_article_title': \"Why the Coach and Kors Marriage Doesn't Scare LVMH\"},\n",
+            " {'news_article_summary': 'The Supreme Court has blocked Purdue Pharma\u2019s $6 '\n",
+            "                          'billion Sackler opioid settlement.',\n",
+            "  'news_article_title': 'Supreme Court Blocks Purdue Pharma\u2019s $6 Billion '\n",
+            "                        'Sackler Opioid Settlement'},\n",
+            " {'news_article_summary': 'The Social Security COLA is expected to rise in '\n",
+            "                          '2024, but not by a lot.',\n",
+            "  'news_article_title': 'Social Security COLA Expected to Rise in 2024, but '\n",
+            "                        'Not by a Lot'}]\n"
+          ]
+        }
+      ],
+      "source": [
+        "import pprint\n",
+        "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
+        "\n",
+        "def scrape_with_playwright(urls, schema):\n",
+        "    \n",
+        "    loader = AsyncChromiumLoader(urls)\n",
+        "    docs = loader.load()\n",
+        "    bs_transformer = BeautifulSoupTransformer()\n",
+        "    docs_transformed = bs_transformer.transform_documents(docs,tags_to_extract=[\"span\"])\n",
+        "    print(\"Extracting content with LLM\")\n",
+        "    \n",
+        "    # Grab the first 1000 tokens of the site\n",
+        "    splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(chunk_size=1000, \n",
+        "                                                                    chunk_overlap=0)\n",
+        "    splits = splitter.split_documents(docs_transformed)\n",
+        "    \n",
+        "    # Process the first split \n",
+        "    extracted_content = extract(\n",
+        "        schema=schema, content=splits[0].page_content\n",
+        "    )\n",
+        "    pprint.pprint(extracted_content)\n",
+        "    return extracted_content\n",
+        "\n",
+        "urls = [\"https://www.wsj.com\"]\n",
+        "extracted_content = scrape_with_playwright(urls, schema=schema)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "b08a8cef",
+      "metadata": {},
+      "source": [
+        "We can compare the headlines scraped to the page:\n",
+        "\n",
+        "![Image description](/img/wsj_page.png)\n",
+        "\n",
+        "Looking at the [LangSmith trace](https://smith.langchain.com/public/c3070198-5b13-419b-87bf-3821cdf34fa6/r), we can see what is going on under the hood:\n",
+        "\n",
+        "* It's following what is explained in the [extraction](docs/use_cases/extraction).\n",
+        "* We call the `information_extraction` function on the input text.\n",
+        "* It will attempt to populate the provided schema from the url content."
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "a5a6f11e",
+      "metadata": {},
+      "source": [
+        "## Research automation\n",
+        "\n",
+        "Related to scraping, we may want to answer specific questions using searched content.\n",
+        "\n",
+        "We can automate the process of [web research](https://blog.langchain.dev/automating-web-research/) using a retriever, such as the `WebResearchRetriever` ([docs](https://python.langchain.com/docs/modules/data_connection/retrievers/web_research)).\n",
+        "\n",
+        "![Image description](/img/web_research.png)\n",
+        "\n",
+        "Copy requirements [from here](https://github.com/langchain-ai/web-explorer/blob/main/requirements.txt):\n",
+        "\n",
+        "`pip install -r requirements.txt`\n",
+        " \n",
+        "Set `GOOGLE_CSE_ID` and `GOOGLE_API_KEY`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 8,
+      "id": "414f0d41",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "from langchain.vectorstores import Chroma\n",
+        "from langchain.embeddings import OpenAIEmbeddings\n",
+        "from langchain.chat_models.openai import ChatOpenAI\n",
+        "from langchain.utilities import GoogleSearchAPIWrapper\n",
+        "from langchain.retrievers.web_research import WebResearchRetriever"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 41,
+      "id": "5d1ce098",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Vectorstore\n",
+        "vectorstore = Chroma(embedding_function=OpenAIEmbeddings(),persist_directory=\"./chroma_db_oai\")\n",
+        "\n",
+        "# LLM\n",
+        "llm = ChatOpenAI(temperature=0)\n",
+        "\n",
+        "# Search \n",
+        "search = GoogleSearchAPIWrapper()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "id": "6d808b9d",
+      "metadata": {},
+      "source": [
+        "Initialize retriever with the above tools to:\n",
+        "    \n",
+        "* Use an LLM to generate multiple relevant search queries (one LLM call)\n",
+        "* Execute a search for each query\n",
+        "* Choose the top K links per query  (multiple search calls in parallel)\n",
+        "* Load the information from all chosen links (scrape pages in parallel)\n",
+        "* Index those documents into a vectorstore\n",
+        "* Find the most relevant documents for each original generated search query"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 42,
+      "id": "e3e3a589",
+      "metadata": {},
+      "outputs": [],
+      "source": [
+        "# Initialize\n",
+        "web_research_retriever = WebResearchRetriever.from_llm(\n",
+        "    vectorstore=vectorstore,\n",
+        "    llm=llm, \n",
+        "    search=search)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 44,
+      "id": "20655b74",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "INFO:langchain.retrievers.web_research:Generating questions for Google Search ...\n",
+            "INFO:langchain.retrievers.web_research:Questions for Google Search (raw): {'question': 'How do LLM Powered Autonomous Agents work?', 'text': LineList(lines=['1. What is the functioning principle of LLM Powered Autonomous Agents?\\n', '2. How do LLM Powered Autonomous Agents operate?\\n'])}\n",
+            "INFO:langchain.retrievers.web_research:Questions for Google Search: ['1. What is the functioning principle of LLM Powered Autonomous Agents?\\n', '2. How do LLM Powered Autonomous Agents operate?\\n']\n",
+            "INFO:langchain.retrievers.web_research:Searching for relevat urls ...\n",
+            "INFO:langchain.retrievers.web_research:Searching for relevat urls ...\n",
+            "INFO:langchain.retrievers.web_research:Search results: [{'title': 'LLM Powered Autonomous Agents | Hacker News', 'link': 'https://news.ycombinator.com/item?id=36488871', 'snippet': 'Jun 26, 2023 ... Exactly. A temperature of 0 means you always pick the highest probability token (i.e. the \"max\" function), while a temperature of 1 means you\\xa0...'}]\n",
+            "INFO:langchain.retrievers.web_research:Searching for relevat urls ...\n",
+            "INFO:langchain.retrievers.web_research:Search results: [{'title': \"LLM Powered Autonomous Agents | Lil'Log\", 'link': 'https://lilianweng.github.io/posts/2023-06-23-agent/', 'snippet': 'Jun 23, 2023 ... Task decomposition can be done (1) by LLM with simple prompting like \"Steps for XYZ.\\\\n1.\" , \"What are the subgoals for achieving XYZ?\" , (2) by\\xa0...'}]\n",
+            "INFO:langchain.retrievers.web_research:New URLs to load: []\n",
+            "INFO:langchain.retrievers.web_research:Grabbing most relevant splits from urls...\n"
+          ]
+        },
+        {
+          "data": {
+            "text/plain": [
+              "{'question': 'How do LLM Powered Autonomous Agents work?',\n",
+              " 'answer': \"LLM-powered autonomous agents work by using LLM as the agent's brain, complemented by several key components such as planning, memory, and tool use. In terms of planning, the agent breaks down large tasks into smaller subgoals and can reflect and refine its actions based on past experiences. Memory is divided into short-term memory, which is used for in-context learning, and long-term memory, which allows the agent to retain and recall information over extended periods. Tool use involves the agent calling external APIs for additional information. These agents have been used in various applications, including scientific discovery and generative agents simulation.\",\n",
+              " 'sources': ''}"
+            ]
+          },
+          "execution_count": 44,
+          "metadata": {},
+          "output_type": "execute_result"
+        }
+      ],
+      "source": [
+        "# Run\n",
+        "import logging\n",
+        "logging.basicConfig()\n",
+        "logging.getLogger(\"langchain.retrievers.web_research\").setLevel(logging.INFO)\n",
+        "from langchain.chains import RetrievalQAWithSourcesChain\n",
+        "user_input = \"How do LLM Powered Autonomous Agents work?\"\n",
+        "qa_chain = RetrievalQAWithSourcesChain.from_chain_type(llm,retriever=web_research_retriever)\n",
+        "result = qa_chain({\"question\": user_input})\n",
+        "result"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "### Going deeper \n",
+        "\n",
+        "* Here's an [app](https://github.com/langchain-ai/web-explorer/tree/main) that wraps this retriever with a lightweight UI."
+      ],
+      "id": "7a940df1"
+    },
+    {
+      "cell_type": "markdown",
+      "id": "312c399e",
+      "metadata": {},
+      "source": [
+        "## Question answering over a website\n",
+        "\n",
+        "To answer questions over a specific website, you can use Apify's [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor, which can deeply crawl websites such as documentation, knowledge bases, help centers, or blogs,\n",
+        "and extract text content from the web pages.\n",
+        "\n",
+        "In the example below, we will deeply crawl the Python documentation of LangChain's Chat LLM models and answer a question over it.\n",
+        "\n",
+        "First, install the requirements\n",
+        "`pip install apify-client openai langchain chromadb tiktoken`\n",
+        " \n",
+        "Next, set `OPENAI_API_KEY` and `APIFY_API_TOKEN` in your environment variables.\n",
+        "\n",
+        "The full code follows:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 2,
+      "id": "9b08da5e",
+      "metadata": {},
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            " Yes, LangChain offers integration with OpenAI chat models. You can use the ChatOpenAI class to interact with OpenAI models.\n"
+          ]
+        }
+      ],
+      "source": [
+        "from langchain.docstore.document import Document\n",
+        "from langchain.indexes import VectorstoreIndexCreator\n",
+        "from langchain.utilities import ApifyWrapper\n",
+        "\n",
+        "apify = ApifyWrapper()\n",
+        "# Call the Actor to obtain text from the crawled webpages\n",
+        "loader = apify.call_actor(\n",
+        "    actor_id=\"apify/website-content-crawler\",\n",
+        "    run_input={\"startUrls\": [{\"url\": \"https://python.langchain.com/docs/integrations/chat/\"}]},\n",
+        "    dataset_mapping_function=lambda item: Document(\n",
+        "        page_content=item[\"text\"] or \"\", metadata={\"source\": item[\"url\"]}\n",
+        "    ),\n",
+        ")\n",
+        "\n",
+        "# Create a vector store based on the crawled data\n",
+        "index = VectorstoreIndexCreator().from_loaders([loader])\n",
+        "\n",
+        "# Query the vector store\n",
+        "query = \"Are any OpenAI chat models integrated in LangChain?\"\n",
+        "result = index.query(query)\n",
+        "print(result)"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3 (ipykernel)",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.9.16"
     }
-   ],
-   "source": [
-    "from langchain.docstore.document import Document\n",
-    "from langchain.indexes import VectorstoreIndexCreator\n",
-    "from langchain.utilities import ApifyWrapper\n",
-    "\n",
-    "apify = ApifyWrapper()\n",
-    "# Call the Actor to obtain text from the crawled webpages\n",
-    "loader = apify.call_actor(\n",
-    "    actor_id=\"apify/website-content-crawler\",\n",
-    "    run_input={\"startUrls\": [{\"url\": \"https://python.langchain.com/docs/integrations/chat/\"}]},\n",
-    "    dataset_mapping_function=lambda item: Document(\n",
-    "        page_content=item[\"text\"] or \"\", metadata={\"source\": item[\"url\"]}\n",
-    "    ),\n",
-    ")\n",
-    "\n",
-    "# Create a vector store based on the crawled data\n",
-    "index = VectorstoreIndexCreator().from_loaders([loader])\n",
-    "\n",
-    "# Query the vector store\n",
-    "query = \"Are any OpenAI chat models integrated in LangChain?\"\n",
-    "result = index.query(query)\n",
-    "print(result)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
   },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
+  "nbformat": 4,
+  "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/libs/langchain/langchain/retrievers/__init__.py b/libs/langchain/langchain/retrievers/__init__.py
index c666d9103f..372093dc10 100644
--- a/libs/langchain/langchain/retrievers/__init__.py
+++ b/libs/langchain/langchain/retrievers/__init__.py
@@ -30,6 +30,7 @@ from langchain.retrievers.ensemble import EnsembleRetriever
 from langchain.retrievers.google_cloud_enterprise_search import (
     GoogleCloudEnterpriseSearchRetriever,
 )
+from langchain.retrievers.kay import KayAiRetriever
 from langchain.retrievers.kendra import AmazonKendraRetriever
 from langchain.retrievers.knn import KNNRetriever
 from langchain.retrievers.llama_index import (
@@ -68,6 +69,7 @@ __all__ = [
     "ChaindeskRetriever",
     "ElasticSearchBM25Retriever",
     "GoogleCloudEnterpriseSearchRetriever",
+    "KayAiRetriever",
     "KNNRetriever",
     "LlamaIndexGraphRetriever",
     "LlamaIndexRetriever",
diff --git a/libs/langchain/langchain/retrievers/kay.py b/libs/langchain/langchain/retrievers/kay.py
new file mode 100644
index 0000000000..47e0471c15
--- /dev/null
+++ b/libs/langchain/langchain/retrievers/kay.py
@@ -0,0 +1,59 @@
+from __future__ import annotations
+
+from typing import Any, List
+
+from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain.schema import BaseRetriever, Document
+
+
+class KayAiRetriever(BaseRetriever):
+    """
+    Retriever for Kay.ai datasets.
+
+    Expects the KAY_API_KEY environment variable to be set; a free key is
+    available at https://kay.ai/. Use `create` to build an instance.
+    """
+
+    client: Any  # Kay client; `create` supplies a kay.rag.retrievers.KayRetriever
+    num_contexts: int  # passed to the client as `num_context` on every query
+
+    @classmethod
+    def create(
+        cls,
+        dataset_id: str,
+        data_types: List[str],
+        num_contexts: int = 6,
+    ) -> KayAiRetriever:
+        """
+        Create a KayAiRetriever given a Kay dataset id and a list of datasources.
+
+        Args:
+            dataset_id: A dataset id category in Kay, like "company".
+            data_types: A list of datasources present within a dataset. For
+                "company" the corresponding datasources could be
+                ["10-K", "10-Q", "8-K", "PressRelease"].
+            num_contexts: The number of documents to retrieve on each query.
+                Defaults to 6. Raises ImportError if `kay` is not installed.
+        """
+        try:
+            from kay.rag.retrievers import KayRetriever
+        except ImportError:  # kay is imported lazily; re-raise with install hint
+            raise ImportError(
+                "Could not import kay python package. Please install it with "
+                "`pip install kay`.",
+            )
+
+        client = KayRetriever(dataset_id, data_types)
+        return cls(client=client, num_contexts=num_contexts)
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        ctxs = self.client.query(query=query, num_context=self.num_contexts)
+        docs = []
+        for ctx in ctxs:
+            page_content = ctx.pop("chunk_embed_text", None)  # rest of ctx -> metadata
+            if page_content is None:
+                continue  # skip contexts that carry no "chunk_embed_text"
+            docs.append(Document(page_content=page_content, metadata={**ctx}))
+        return docs
diff --git a/libs/langchain/tests/integration_tests/retrievers/test_kay.py b/libs/langchain/tests/integration_tests/retrievers/test_kay.py
new file mode 100644
index 0000000000..84c511fd6f
--- /dev/null
+++ b/libs/langchain/tests/integration_tests/retrievers/test_kay.py
@@ -0,0 +1,24 @@
+"""Integration test for Kay.ai API Wrapper."""
+import pytest
+
+from langchain.retrievers import KayAiRetriever
+from langchain.schema import Document
+
+
+@pytest.mark.requires("kay")
+def test_kay_retriever() -> None:
+    retriever = KayAiRetriever.create(
+        dataset_id="company",
+        data_types=["10-K", "10-Q", "8-K", "PressRelease"],
+        num_contexts=3,
+    )
+    docs = retriever.get_relevant_documents(
+        "What were the biggest strategy changes and partnerships made by Roku "
+        "in 2023?",
+    )
+    assert len(docs) == 3
+    for doc in docs:
+        assert isinstance(doc, Document)
+        assert doc.page_content
+        assert doc.metadata
+        assert len(list(doc.metadata.items())) > 0