From 89ef440c14634e74bae069925ef75d3ff4b12445 Mon Sep 17 00:00:00 2001 From: Palau Date: Mon, 25 Sep 2023 16:10:13 -0400 Subject: [PATCH] Kay retriever (#10657) - **Description**: Adding retrievers for [kay.ai](https://kay.ai) and SEC filings powered by Kay and Cybersyn. Kay provides context as a service: it's an API built for RAG. - **Issue**: N/A - **Dependencies**: Just added a dep to the [kay](https://pypi.org/project/kay/) package - **Tag maintainer**: @baskaryan @hwchase17 Discussed in slack - **Twitter handle:** [@vishalrohra_](https://twitter.com/vishalrohra_) --------- Co-authored-by: Bagatur --- .../document_loaders/aws_s3_directory.ipynb | 305 ++-- .../document_loaders/aws_s3_file.ipynb | 233 +-- .../dynamodb_chat_message_history.ipynb | 684 ++++----- docs/extras/integrations/retrievers/kay.ipynb | 207 +++ .../integrations/retrievers/pubmed.ipynb | 2 +- .../integrations/retrievers/sec_filings.ipynb | 165 +++ docs/extras/use_cases/web_scraping.ipynb | 1299 +++++++++-------- .../langchain/retrievers/__init__.py | 2 + libs/langchain/langchain/retrievers/kay.py | 59 + .../integration_tests/retrievers/test_kay.py | 24 + 10 files changed, 1722 insertions(+), 1258 deletions(-) create mode 100644 docs/extras/integrations/retrievers/kay.ipynb create mode 100644 docs/extras/integrations/retrievers/sec_filings.ipynb create mode 100644 libs/langchain/langchain/retrievers/kay.py create mode 100644 libs/langchain/tests/integration_tests/retrievers/test_kay.py diff --git a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb index 4d474ca64a..e40a3702c6 100644 --- a/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb +++ b/docs/extras/integrations/document_loaders/aws_s3_directory.ipynb @@ -1,156 +1,159 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "a634365e", - "metadata": {}, - "source": [ - "# AWS S3 Directory\n", - "\n", - ">[Amazon Simple Storage Service (Amazon 
S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service\n", - "\n", - ">[AWS S3 Directory](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html)\n", - "\n", - "This covers how to load document objects from an `AWS S3 Directory` object." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "49815096", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "#!pip install boto3" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "2f0cd6a5", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from langchain.document_loaders import S3DirectoryLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "321cc7f1", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "loader = S3DirectoryLoader(\"testing-hwc\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b11d155", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "loader.load()" - ] - }, - { - "cell_type": "markdown", - "id": "0690c40a", - "metadata": {}, - "source": [ - "## Specifying a prefix\n", - "You can also specify a prefix for more finegrained control over what files to load." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "72d44781", - "metadata": {}, - "outputs": [], - "source": [ - "loader = S3DirectoryLoader(\"testing-hwc\", prefix=\"fake\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "2d3c32db", - "metadata": {}, - "outputs": [ + "cells": [ + { + "cell_type": "markdown", + "id": "a634365e", + "metadata": {}, + "source": [ + "# AWS S3 Directory\n", + "\n", + ">[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service\n", + "\n", + ">[AWS S3 Directory](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html)\n", + "\n", + "This covers how to load document objects from an `AWS S3 Directory` object." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49815096", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "#!pip install boto3" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "2f0cd6a5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from langchain.document_loaders import S3DirectoryLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "321cc7f1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "loader = S3DirectoryLoader(\"testing-hwc\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b11d155", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "0690c40a", + "metadata": {}, + "source": [ + "## Specifying a prefix\n", + "You can also specify a prefix for more fine-grained control over what files to load." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "72d44781", + "metadata": {}, + "outputs": [], + "source": [ + "loader = S3DirectoryLoader(\"testing-hwc\", prefix=\"fake\")" + ] + }, { - "data": { - "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" + "cell_type": "code", + "execution_count": 6, + "id": "2d3c32db", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" + }, + { + "cell_type": "markdown", + "source": [ + "## Configuring the AWS Boto3 client\n", + "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n", + "named arguments when creating the S3DirectoryLoader.\n", + "This is useful for instance when AWS credentials can't be set as environment variables.\n", + "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured." 
+ ], + "metadata": {}, + "id": "91a7ac07" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "loader = S3DirectoryLoader(\"testing-hwc\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")" + ], + "metadata": {}, + "id": "f485ec8c" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "loader.load()" + ], + "metadata": {}, + "id": "c0fa76ae" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" } - ], - "source": [ - "loader.load()" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Configuring the AWS Boto3 client\n", - "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n", - "named arguments when creating the S3DirectoryLoader.\n", - "This is useful for instance when AWS credentials can't be set as environment variables.\n", - "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured." 
- ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "loader = S3DirectoryLoader(\"testing-hwc\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "loader.load()" - ], - "metadata": {} - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb index 91d7751eb6..a13fcf5cff 100644 --- a/docs/extras/integrations/document_loaders/aws_s3_file.ipynb +++ b/docs/extras/integrations/document_loaders/aws_s3_file.ipynb @@ -1,121 +1,122 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "66a7777e", - "metadata": {}, - "source": [ - "# AWS S3 File\n", - "\n", - ">[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service.\n", - "\n", - ">[AWS S3 Buckets](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html)\n", - "\n", - "This covers how to load document objects from an `AWS S3 File` object." 
- ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "9ec8a3b3", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.document_loaders import S3FileLoader" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "43128d8d", - "metadata": {}, - "outputs": [], - "source": [ - "#!pip install boto3" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "35d6809a", - "metadata": {}, - "outputs": [], - "source": [ - "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "efd6be84", - "metadata": {}, - "outputs": [ + "cells": [ + { + "cell_type": "markdown", + "id": "66a7777e", + "metadata": {}, + "source": [ + "# AWS S3 File\n", + "\n", + ">[Amazon Simple Storage Service (Amazon S3)](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-folders.html) is an object storage service.\n", + "\n", + ">[AWS S3 Buckets](https://docs.aws.amazon.com/AmazonS3/latest/userguide/UsingBucket.html)\n", + "\n", + "This covers how to load document objects from an `AWS S3 File` object." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "9ec8a3b3", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import S3FileLoader" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "43128d8d", + "metadata": {}, + "outputs": [], + "source": [ + "#!pip install boto3" + ] + }, { - "data": { - "text/plain": [ - "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" + "cell_type": "code", + "execution_count": 8, + "id": "35d6809a", + "metadata": {}, + "outputs": [], + "source": [ + "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\")" ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "efd6be84", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Lorem ipsum dolor sit amet.', lookup_str='', metadata={'source': 's3://testing-hwc/fake.docx'}, lookup_index=0)]" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "loader.load()" + ] + }, + { + "cell_type": "markdown", + "id": "93689594", + "metadata": {}, + "source": [ + "## Configuring the AWS Boto3 client\n", + "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n", + "named arguments when creating the S3FileLoader.\n", + "This is useful for instance when AWS credentials can't be set as environment variables.\n", + "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")" + ], + "metadata": {}, + "id": "43106ee8" + }, + { + "cell_type": "code", + "execution_count": null, + "outputs": [], + "source": [ + "loader.load()" + ], + "metadata": {}, + "id": "1764a727" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" } - ], - "source": [ - "loader.load()" - ] - }, - { - "cell_type": "markdown", - "id": "93689594", - "metadata": {}, - "source": [ - "## Configuring the AWS Boto3 client\n", - "You can configure the AWS [Boto3](https://boto3.amazonaws.com/v1/documentation/api/latest/index.html) client by passing\n", - "named arguments when creating the S3DirectoryLoader.\n", - "This is useful for instance when AWS credentials can't be set as environment variables.\n", - "See the [list of parameters](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/core/session.html#boto3.session.Session) that can be configured." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "loader = S3FileLoader(\"testing-hwc\", \"fake.docx\", aws_access_key_id=\"xxxx\", aws_secret_access_key=\"yyyy\")" - ], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, - "outputs": [], - "source": [ - "loader.load()" - ], - "metadata": {} - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.6" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} - + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb b/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb index 53e7230e2b..3f324ef5e8 100644 --- a/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb +++ b/docs/extras/integrations/memory/dynamodb_chat_message_history.ipynb @@ -1,350 +1,352 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "91c6a7ef", - "metadata": {}, - "source": [ - "# Dynamodb Chat Message History\n", - "\n", - "This notebook goes over how to use Dynamodb to store chat message history." - ] - }, - { - "cell_type": "markdown", - "id": "3f608be0", - "metadata": {}, - "source": [ - "First make sure you have correctly configured the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). Then make sure you have installed boto3." 
- ] - }, - { - "cell_type": "markdown", - "id": "030d784f", - "metadata": {}, - "source": [ - "Next, create the DynamoDB Table where we will be storing messages:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "93ce1811", - "metadata": {}, - "outputs": [ + "cells": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "0\n" - ] - } - ], - "source": [ - "import boto3\n", - "\n", - "# Get the service resource.\n", - "dynamodb = boto3.resource(\"dynamodb\")\n", - "\n", - "# Create the DynamoDB table.\n", - "table = dynamodb.create_table(\n", - " TableName=\"SessionTable\",\n", - " KeySchema=[{\"AttributeName\": \"SessionId\", \"KeyType\": \"HASH\"}],\n", - " AttributeDefinitions=[{\"AttributeName\": \"SessionId\", \"AttributeType\": \"S\"}],\n", - " BillingMode=\"PAY_PER_REQUEST\",\n", - ")\n", - "\n", - "# Wait until the table exists.\n", - "table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"SessionTable\")\n", - "\n", - "# Print out some data about the table.\n", - "print(table.item_count)" - ] - }, - { - "cell_type": "markdown", - "id": "1a9b310b", - "metadata": {}, - "source": [ - "## DynamoDBChatMessageHistory" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "d15e3302", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n", - "\n", - "history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"0\")\n", - "\n", - "history.add_user_message(\"hi!\")\n", - "\n", - "history.add_ai_message(\"whats up?\")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "64fc465e", - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "id": "91c6a7ef", + "metadata": {}, + "source": [ + "# Dynamodb Chat Message History\n", + "\n", + "This notebook goes over how to use Dynamodb to store chat message history." 
+ ] + }, { - "data": { - "text/plain": "[HumanMessage(content='hi!', additional_kwargs={}, example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False),\n HumanMessage(content='hi!', additional_kwargs={}, example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False)]" - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "history.messages" - ] - }, - { - "cell_type": "markdown", - "id": "955f1b15", - "metadata": {}, - "source": [ - "## DynamoDBChatMessageHistory with Custom Endpoint URL\n", - "\n", - "Sometimes it is useful to specify the URL to the AWS endpoint to connect to. For instance, when you are running locally against [Localstack](https://localstack.cloud/). For those cases you can specify the URL via the `endpoint_url` parameter in the constructor." - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "225713c8", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n", - "\n", - "history = DynamoDBChatMessageHistory(\n", - " table_name=\"SessionTable\",\n", - " session_id=\"0\",\n", - " endpoint_url=\"http://localhost.localstack.cloud:4566\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "source": [ - "## DynamoDBChatMessageHistory With Different Keys Composite Keys\n", - "The default key for DynamoDBChatMessageHistory is ```{\"SessionId\": self.session_id}```, but you can modify this to match your table design.\n", - "\n", - "### Primary Key Name\n", - "You may modify the primary key by passing in a primary_key_name value in the constructor, resulting in the following:\n", - "```{self.primary_key_name: self.session_id}```\n", - "\n", - "### Composite Keys\n", - "When using an existing DynamoDB table, you may need to modify the key structure from the default of to something including a Sort Key. 
To do this you may use the ```key``` parameter.\n", - "\n", - "Passing a value for key will override the primary_key parameter, and the resulting key structure will be the passed value.\n" - ], - "metadata": { - "collapsed": false - } - }, - { - "cell_type": "code", - "execution_count": 14, - "outputs": [ + "cell_type": "markdown", + "id": "3f608be0", + "metadata": {}, + "source": [ + "First make sure you have correctly configured the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html). Then make sure you have installed boto3." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "0\n" - ] + "cell_type": "markdown", + "id": "030d784f", + "metadata": {}, + "source": [ + "Next, create the DynamoDB Table where we will be storing messages:" + ] }, { - "data": { - "text/plain": "[HumanMessage(content='hello, composite dynamodb table!', additional_kwargs={}, example=False)]" - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n", - "\n", - "composite_table = dynamodb.create_table(\n", - " TableName=\"CompositeTable\",\n", - " KeySchema=[{\"AttributeName\": \"PK\", \"KeyType\": \"HASH\"}, {\"AttributeName\": \"SK\", \"KeyType\": \"RANGE\"}],\n", - " AttributeDefinitions=[{\"AttributeName\": \"PK\", \"AttributeType\": \"S\"}, {\"AttributeName\": \"SK\", \"AttributeType\": \"S\"}],\n", - " BillingMode=\"PAY_PER_REQUEST\",\n", - ")\n", - "\n", - "# Wait until the table exists.\n", - "composite_table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"CompositeTable\")\n", - "\n", - "# Print out some data about the table.\n", - "print(composite_table.item_count)\n", - "\n", - "my_key = {\n", - " \"PK\": \"session_id::0\",\n", - " \"SK\": \"langchain_history\",\n", - "}\n", - "\n", - "composite_key_history = DynamoDBChatMessageHistory(\n", - " table_name=\"CompositeTable\",\n", - " 
session_id=\"0\",\n", - " endpoint_url=\"http://localhost.localstack.cloud:4566\",\n", - " key=my_key,\n", - ")\n", - "\n", - "composite_key_history.add_user_message(\"hello, composite dynamodb table!\")\n", - "\n", - "composite_key_history.messages" - ], - "metadata": { - "collapsed": false - } - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "3b33c988", - "metadata": {}, - "source": [ - "## Agent with DynamoDB Memory" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "f92d9499", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.agents import Tool\n", - "from langchain.memory import ConversationBufferMemory\n", - "from langchain.chat_models import ChatOpenAI\n", - "from langchain.agents import initialize_agent\n", - "from langchain.agents import AgentType\n", - "from langchain.utilities import PythonREPL\n", - "from getpass import getpass\n", - "\n", - "message_history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"1\")\n", - "memory = ConversationBufferMemory(\n", - " memory_key=\"chat_history\", chat_memory=message_history, return_messages=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "1167eeba", - "metadata": {}, - "outputs": [], - "source": [ - "python_repl = PythonREPL()\n", - "\n", - "# You can create the tool to pass to an agent\n", - "tools = [\n", - " Tool(\n", - " name=\"python_repl\",\n", - " description=\"A Python shell. Use this to execute python commands. Input should be a valid python command. 
If you want to see the output of a value, you should print it out with `print(...)`.\",\n", - " func=python_repl.run,\n", - " )\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "fce085c5", - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 10, + "id": "93ce1811", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + } + ], + "source": [ + "import boto3\n", + "\n", + "# Get the service resource.\n", + "dynamodb = boto3.resource(\"dynamodb\")\n", + "\n", + "# Create the DynamoDB table.\n", + "table = dynamodb.create_table(\n", + " TableName=\"SessionTable\",\n", + " KeySchema=[{\"AttributeName\": \"SessionId\", \"KeyType\": \"HASH\"}],\n", + " AttributeDefinitions=[{\"AttributeName\": \"SessionId\", \"AttributeType\": \"S\"}],\n", + " BillingMode=\"PAY_PER_REQUEST\",\n", + ")\n", + "\n", + "# Wait until the table exists.\n", + "table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"SessionTable\")\n", + "\n", + "# Print out some data about the table.\n", + "print(table.item_count)" + ] + }, + { + "cell_type": "markdown", + "id": "1a9b310b", + "metadata": {}, + "source": [ + "## DynamoDBChatMessageHistory" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "d15e3302", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n", + "\n", + "history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"0\")\n", + "\n", + "history.add_user_message(\"hi!\")\n", + "\n", + "history.add_ai_message(\"whats up?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "64fc465e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": "[HumanMessage(content='hi!', additional_kwargs={}, example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False),\n HumanMessage(content='hi!', additional_kwargs={}, 
example=False),\n AIMessage(content='whats up?', additional_kwargs={}, example=False)]" + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "history.messages" + ] + }, + { + "cell_type": "markdown", + "id": "955f1b15", + "metadata": {}, + "source": [ + "## DynamoDBChatMessageHistory with Custom Endpoint URL\n", + "\n", + "Sometimes it is useful to specify the URL to the AWS endpoint to connect to. For instance, when you are running locally against [Localstack](https://localstack.cloud/). For those cases you can specify the URL via the `endpoint_url` parameter in the constructor." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "225713c8", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n", + "\n", + "history = DynamoDBChatMessageHistory(\n", + " table_name=\"SessionTable\",\n", + " session_id=\"0\",\n", + " endpoint_url=\"http://localhost.localstack.cloud:4566\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## DynamoDBChatMessageHistory With Different Keys Composite Keys\n", + "The default key for DynamoDBChatMessageHistory is ```{\"SessionId\": self.session_id}```, but you can modify this to match your table design.\n", + "\n", + "### Primary Key Name\n", + "You may modify the primary key by passing in a primary_key_name value in the constructor, resulting in the following:\n", + "```{self.primary_key_name: self.session_id}```\n", + "\n", + "### Composite Keys\n", + "When using an existing DynamoDB table, you may need to modify the key structure from the default to something including a Sort Key. 
To do this you may use the ```key``` parameter.\n", + "\n", + "Passing a value for key will override the primary_key parameter, and the resulting key structure will be the passed value.\n" + ], + "metadata": { + "collapsed": false + }, + "id": "c9bc0693" + }, + { + "cell_type": "code", + "execution_count": 14, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0\n" + ] + }, + { + "data": { + "text/plain": "[HumanMessage(content='hello, composite dynamodb table!', additional_kwargs={}, example=False)]" + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from langchain.memory.chat_message_histories import DynamoDBChatMessageHistory\n", + "\n", + "composite_table = dynamodb.create_table(\n", + " TableName=\"CompositeTable\",\n", + " KeySchema=[{\"AttributeName\": \"PK\", \"KeyType\": \"HASH\"}, {\"AttributeName\": \"SK\", \"KeyType\": \"RANGE\"}],\n", + " AttributeDefinitions=[{\"AttributeName\": \"PK\", \"AttributeType\": \"S\"}, {\"AttributeName\": \"SK\", \"AttributeType\": \"S\"}],\n", + " BillingMode=\"PAY_PER_REQUEST\",\n", + ")\n", + "\n", + "# Wait until the table exists.\n", + "composite_table.meta.client.get_waiter(\"table_exists\").wait(TableName=\"CompositeTable\")\n", + "\n", + "# Print out some data about the table.\n", + "print(composite_table.item_count)\n", + "\n", + "my_key = {\n", + " \"PK\": \"session_id::0\",\n", + " \"SK\": \"langchain_history\",\n", + "}\n", + "\n", + "composite_key_history = DynamoDBChatMessageHistory(\n", + " table_name=\"CompositeTable\",\n", + " session_id=\"0\",\n", + " endpoint_url=\"http://localhost.localstack.cloud:4566\",\n", + " key=my_key,\n", + ")\n", + "\n", + "composite_key_history.add_user_message(\"hello, composite dynamodb table!\")\n", + "\n", + "composite_key_history.messages" + ], + "metadata": { + "collapsed": false + }, + "id": "a7fa0331" + }, { - "ename": "ValidationError", - "evalue": "1 validation error for 
ChatOpenAI\n__root__\n Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)", - "output_type": "error", - "traceback": [ - "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mValidationError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[17], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m llm \u001B[38;5;241m=\u001B[39m \u001B[43mChatOpenAI\u001B[49m\u001B[43m(\u001B[49m\u001B[43mtemperature\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[38;5;241;43m0\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[1;32m 2\u001B[0m agent_chain \u001B[38;5;241m=\u001B[39m initialize_agent(\n\u001B[1;32m 3\u001B[0m tools,\n\u001B[1;32m 4\u001B[0m llm,\n\u001B[0;32m (...)\u001B[0m\n\u001B[1;32m 7\u001B[0m memory\u001B[38;5;241m=\u001B[39mmemory,\n\u001B[1;32m 8\u001B[0m )\n", - "File \u001B[0;32m~/Documents/projects/langchain/libs/langchain/langchain/load/serializable.py:74\u001B[0m, in \u001B[0;36mSerializable.__init__\u001B[0;34m(self, **kwargs)\u001B[0m\n\u001B[1;32m 73\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21m__init__\u001B[39m(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs: Any) \u001B[38;5;241m-\u001B[39m\u001B[38;5;241m>\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m:\n\u001B[0;32m---> 74\u001B[0m \u001B[38;5;28;43msuper\u001B[39;49m\u001B[43m(\u001B[49m\u001B[43m)\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[38;5;21;43m__init__\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mkwargs\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 75\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_lc_kwargs \u001B[38;5;241m=\u001B[39m kwargs\n", - "File \u001B[0;32m~/Documents/projects/langchain/.venv/lib/python3.9/site-packages/pydantic/main.py:341\u001B[0m, in 
\u001B[0;36mpydantic.main.BaseModel.__init__\u001B[0;34m()\u001B[0m\n", - "\u001B[0;31mValidationError\u001B[0m: 1 validation error for ChatOpenAI\n__root__\n Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)" - ] + "attachments": {}, + "cell_type": "markdown", + "id": "3b33c988", + "metadata": {}, + "source": [ + "## Agent with DynamoDB Memory" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f92d9499", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import Tool\n", + "from langchain.memory import ConversationBufferMemory\n", + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.agents import initialize_agent\n", + "from langchain.agents import AgentType\n", + "from langchain.utilities import PythonREPL\n", + "from getpass import getpass\n", + "\n", + "message_history = DynamoDBChatMessageHistory(table_name=\"SessionTable\", session_id=\"1\")\n", + "memory = ConversationBufferMemory(\n", + " memory_key=\"chat_history\", chat_memory=message_history, return_messages=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1167eeba", + "metadata": {}, + "outputs": [], + "source": [ + "python_repl = PythonREPL()\n", + "\n", + "# You can create the tool to pass to an agent\n", + "tools = [\n", + " Tool(\n", + " name=\"python_repl\",\n", + " description=\"A Python shell. Use this to execute python commands. Input should be a valid python command. 
If you want to see the output of a value, you should print it out with `print(...)`.\",\n", + " func=python_repl.run,\n", + " )\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "fce085c5", + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 validation error for ChatOpenAI\n__root__\n Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[17], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mChatOpenAI\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtemperature\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m0\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2\u001b[0m agent_chain \u001b[38;5;241m=\u001b[39m initialize_agent(\n\u001b[1;32m 3\u001b[0m tools,\n\u001b[1;32m 4\u001b[0m llm,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 7\u001b[0m memory\u001b[38;5;241m=\u001b[39mmemory,\n\u001b[1;32m 8\u001b[0m )\n", + "File \u001b[0;32m~/Documents/projects/langchain/libs/langchain/langchain/load/serializable.py:74\u001b[0m, in \u001b[0;36mSerializable.__init__\u001b[0;34m(self, **kwargs)\u001b[0m\n\u001b[1;32m 73\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m__init__\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs: Any) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m---> 74\u001b[0m 
\u001b[38;5;28;43msuper\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__init__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_lc_kwargs \u001b[38;5;241m=\u001b[39m kwargs\n", + "File \u001b[0;32m~/Documents/projects/langchain/.venv/lib/python3.9/site-packages/pydantic/main.py:341\u001b[0m, in \u001b[0;36mpydantic.main.BaseModel.__init__\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for ChatOpenAI\n__root__\n Did not find openai_api_key, please add an environment variable `OPENAI_API_KEY` which contains it, or pass `openai_api_key` as a named parameter. (type=value_error)" + ] + } + ], + "source": [ + "llm = ChatOpenAI(temperature=0)\n", + "agent_chain = initialize_agent(\n", + " tools,\n", + " llm,\n", + " agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,\n", + " verbose=True,\n", + " memory=memory,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "952a3103", + "metadata": {}, + "outputs": [], + "source": [ + "agent_chain.run(input=\"Hello!\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "54c4aaf4", + "metadata": {}, + "outputs": [], + "source": [ + "agent_chain.run(input=\"Who owns Twitter?\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9013118", + "metadata": {}, + "outputs": [], + "source": [ + "agent_chain.run(input=\"My name is Bob.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "405e5315", + "metadata": {}, + "outputs": [], + "source": [ + "agent_chain.run(input=\"Who am I?\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + 
"name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" } - ], - "source": [ - "llm = ChatOpenAI(temperature=0)\n", - "agent_chain = initialize_agent(\n", - " tools,\n", - " llm,\n", - " agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION,\n", - " verbose=True,\n", - " memory=memory,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "952a3103", - "metadata": {}, - "outputs": [], - "source": [ - "agent_chain.run(input=\"Hello!\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "54c4aaf4", - "metadata": {}, - "outputs": [], - "source": [ - "agent_chain.run(input=\"Who owns Twitter?\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f9013118", - "metadata": {}, - "outputs": [], - "source": [ - "agent_chain.run(input=\"My name is Bob.\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "405e5315", - "metadata": {}, - "outputs": [], - "source": [ - "agent_chain.run(input=\"Who am I?\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.3" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/docs/extras/integrations/retrievers/kay.ipynb b/docs/extras/integrations/retrievers/kay.ipynb new file mode 100644 index 0000000000..1175d11924 --- /dev/null +++ b/docs/extras/integrations/retrievers/kay.ipynb @@ -0,0 +1,207 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "263f914c-9d67-4316-8b3d-03c3b99ba9d8", + 
"metadata": {}, + "source": [ + "Kay.ai\n", + "=\n", + "\n", + "> Data API built for RAG 🕵️ We are curating the world's largest datasets as high-quality embeddings so your AI agents can retrieve context on the fly. Latest models, fast retrieval, and zero infra.\n", + "\n", + "This notebook shows you how to retrieve datasets supported by [Kay](https://kay.ai/). You can currently search SEC Filings and Press Releases of US companies. Visit [kay.ai](https://kay.ai) for the latest data drops. For any questions, join our [discord](https://discord.gg/hAnE4e5T6M) or [tweet at us](https://twitter.com/vishalrohra_)" + ] + }, + { + "cell_type": "markdown", + "id": "fc507b8e-ea51-417c-93da-42bf998a1195", + "metadata": {}, + "source": [ + "Installation\n", + "=\n", + "\n", + "First you will need to install the [`kay` package](https://pypi.org/project/kay/). You will also need an API key: you can get one for free at [https://kay.ai](https://kay.ai/). Once you have an API key, you must set it as an environment variable `KAY_API_KEY`.\n", + "\n", + "`KayAiRetriever` has a static `.create()` factory method that takes the following arguments:\n", + "\n", + "* `dataset_id: string` required -- A Kay dataset id. This is a collection of data about a particular entity such as companies, people, or places. For example, try `\"company\"` \n", + "* `data_types: List[string]` optional -- This is a category within a dataset based on its origin or format, such as ‘SEC Filings’, ‘Press Releases’, or ‘Reports’ within the “company” dataset. For example, try [\"10-K\", \"10-Q\", \"PressRelease\"] under the “company” dataset. 
If left empty, Kay will retrieve the most relevant context across all types.\n", + "* `num_contexts: int` optional, defaults to 6 -- The number of document chunks to retrieve on each call to `get_relevant_documents()`" + ] + }, + { + "cell_type": "markdown", + "id": "c923bea0-585a-4f62-8662-efc167e8d793", + "metadata": {}, + "source": [ + "Examples\n", + "=\n", + "\n", + "Basic Retriever Usage\n", + "-" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f7b8c99c-0341-4f3c-912f-a11e98f7de71", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "# Setup API key\n", + "from getpass import getpass\n", + "KAY_API_KEY = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "b4d4d386-2a6b-4942-863e-9202f5a9f1d6", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.retrievers import KayAiRetriever\n", + "import os\n", + "from kay.rag.retrievers import KayRetriever\n", + "os.environ[\"KAY_API_KEY\"] = KAY_API_KEY\n", + "retriever = KayAiRetriever.create(dataset_id=\"company\", data_types=[\"10-K\", \"10-Q\", \"PressRelease\"], num_contexts=3)\n", + "docs = retriever.get_relevant_documents(\"What were the biggest strategy changes and partnerships made by Roku in 2023??\")" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "04ee2d6b-c2ab-4e15-8a8b-afaf6ef8c0f6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[Document(page_content='Company Name: ROKU INC\\nCompany Industry: CABLE & OTHER PAY TELEVISION SERVICES\\nArticle Title: Roku and FreeWheel Announce Strategic Partnership to Bring Roku’s Leading Ad Tech to FreeWheel Customers\\nText: Additionally, eMarketer Link: 
https://cts.businesswire.com/ct/CT?id=smartlink&url=https%3A%2F%2Fwww.insiderintelligence.com%2Finsights%2Favod-more-than-50-percent-of-us-digital-video-viewers%2F&esheet=53451144&newsitemid=20230712907788&lan=en-US&anchor=eMarketer&index=4&md5=b64dea72bcf6b6379474462602781d83 projects 57% of U.S. digital video users will stream an advertising-based video on demand (AVOD) service this year.\\nHaving solutions aimed at driving greater interoperability and automation will help accelerate this growth.\\nKey highlights of this collaboration include:\\nStreamlined Integration: Roku has now integrated its demand application programming interface (dAPI) with FreeWheel s TV platform. Roku s demand API gives publishers direct, automatic and real-time access to more advertiser demand. This enhanced integration allows for streamlined ad operation workflows and better inventory quality control, both of which will improve publisher yield and revenue.\\nSeamless Data Targeting: Publishers can now use Roku platform signals to enable advertisers to target audiences and measure campaign performance without relying on cookies. 
Additionally, FreeWheel and Roku will rely on data clean room technology to enable the activation of additional data sets providing better measurement and monetization to publishers and agencies.', metadata={'_additional': {'id': '962b79e0-f9d1-43ae-9f7a-8a9b42bc7a9a'}, 'chunk_type': 'text', 'chunk_years_mentioned': [], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': 'PressRelease', 'data_source_link': 'https://www.nasdaq.com/press-release/roku-and-freewheel-announce-strategic-partnership-to-bring-rokus-leading-ad-tech-to', 'data_source_publish_date': '2023-07-12T00:00:00Z', 'data_source_uid': 'a46f309c-705d-3946-96db-87aa4e73261f', 'title': 'ROKU INC | Roku and FreeWheel Announce Strategic Partnership to Bring Roku’s Leading Ad Tech to FreeWheel Customers'}),\n", + " Document(page_content='Company Name: ROKU INC \\n Company Industry: CABLE & OTHER PAY TELEVISION SERVICES \\n Form Title: 10-K 2022-FY \\n Form Section: Risk Factors \\n Text: nd the Note Regarding Forward Looking Statements.This section of this Annual Report generally discusses fiscal years 2022 and 2021 and year to year comparisons between those years.Discussions of fiscal year 2020 and year to year comparisons between fiscal years 2021 and 2020 that are not included in this Annual Report can be found in Management\\'s Discussion and Analysis of Financial Condition and Results of Operations in Part II, Item 7 of our Annual Report for the fiscal year ended December 31, 2021 filed with the SEC on February 18, 2022.Overview Effective as of the fourth quarter of fiscal 2022, we reorganized our reportable segments to better align with management\\'s reporting of information reviewed by the Chief Operating Decision Maker (\"CODM\") for each segment.We renamed our \"player\" segment to \"devices\" which now includes our licensing arrangements with service operators and licensed Roku TV partners in addition to sales of our streaming 
players, audio products, smart home products and Roku branded TVs that will be designed, made, and sold by us in 2023.Our historical segment information is recast to conform to our new presentation in our financial statements and accompanying notes included in Item 8 of this Annual Report.Our two reportable segments are the platform segment and the devices segment.', metadata={'_additional': {'id': 'a76c5fed-5d63-45a7-b63a-2c30e05140fc'}, 'chunk_type': 'text', 'chunk_years_mentioned': [2020, 2021, 2022, 2023], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': '10-K', 'data_source_link': 'https://www.sec.gov/Archives/edgar/data/1428439/000142843923000007', 'data_source_publish_date': '2022-01-01T00:00:00Z', 'data_source_uid': '0001428439-23-000007', 'title': 'ROKU INC | 10-K 2022-FY '}),\n", + " Document(page_content='Company Name: ROKU INC \\n Company Industry: CABLE & OTHER PAY TELEVISION SERVICES \\n Form Title: 10-Q 2023-Q1 \\n Form Section: Risk Factors \\n Text: Our current and potential partners include TV brands, cable and satellite companies, and telecommunication providers.Under these license arrangements, we generally have limited or no control over the amount and timing of resources these entities dedicate to the relationship.In the past, our licensed Roku TV partners have failed to meet their forecasts and anticipated market launch dates for distributing Roku TV models, and they may fail to meet their forecasts or such launches in the future.If our licensed Roku TV partners or service operator partners fail to meet their forecasts or such launches for distributing licensed streaming devices or choose to deploy competing streaming solutions within their product lines, our business may be harmed.We depend on a small number of content publishers for a majority of our streaming hours, and if we fail to maintain these relationships, our business could be harmed.*Historically, a small number of 
content publishers have accounted for a significant portion of the hours streamed on our platform.In the three months ended March 31, 2023, the top three streaming services represented over 50% of all hours streamed in the period.If, for any reason, we cease distributing channels that have historically streamed a large percentage of the aggregate streaming hours on our platform, our streaming hours, our active accounts, or Roku streaming device sales may be adversely affected, and our business may be harmed.', metadata={'_additional': {'id': '2a92b2bb-02a0-4e15-8b64-d7e04078a205'}, 'chunk_type': 'text', 'chunk_years_mentioned': [2023], 'company_name': 'ROKU INC', 'company_sic_code_description': 'CABLE & OTHER PAY TELEVISION SERVICES', 'data_source': '10-Q', 'data_source_link': 'https://www.sec.gov/Archives/edgar/data/1428439/000142843923000017', 'data_source_publish_date': '2023-01-01T00:00:00Z', 'data_source_uid': '0001428439-23-000017', 'title': 'ROKU INC | 10-Q 2023-Q1 '})]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, + { + "cell_type": "markdown", + "id": "21f6e9e5-478c-4b2c-9d61-f7a84f4d2f8f", + "metadata": {}, + "source": [ + "Usage in a chain\n", + "-" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d1cba716-ab8d-4518-9196-43f17eb189dc", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "OPENAI_API_KEY = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "79441f1f-fa06-452c-bcd6-160ad0debc6a", + "metadata": {}, + "outputs": [], + "source": [ + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "0c504bcd-f6e0-4028-a797-b31fb4b6d027", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.chains import 
ConversationalRetrievalChain\n", + "\n", + "model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n", + "qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "977f158b-38d3-4b5f-9379-7cdd09436327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-> **Question**: What were the biggest strategy changes and partnerships made by Roku in 2023? \n", + "\n", + "**Answer**: In 2023, Roku made a strategic partnership with FreeWheel to bring Roku's leading ad tech to FreeWheel customers. This partnership aimed to drive greater interoperability and automation in the advertising-based video on demand (AVOD) space. Key highlights of this collaboration include streamlined integration of Roku's demand application programming interface (dAPI) with FreeWheel's TV platform, allowing for better inventory quality control and improved publisher yield and revenue. Additionally, publishers can now use Roku platform signals to enable advertisers to target audiences and measure campaign performance without relying on cookies. This partnership also involves the use of data clean room technology to enable the activation of additional data sets for better measurement and monetization for publishers and agencies. These partnerships and strategies aim to support Roku's growth in the AVOD market. 
\n", + "\n" + ] + } + ], + "source": [ + "questions = [\n", + " \"What were the biggest strategy changes and partnerships made by Roku in 2023?\"\n", + " # \"Where is Wex making the most money in 2023?\",\n", + "]\n", + "chat_history = []\n", + "\n", + "for question in questions:\n", + " result = qa({\"question\": question, \"chat_history\": chat_history})\n", + " chat_history.append((question, result[\"answer\"]))\n", + " print(f\"-> **Question**: {question} \\n\")\n", + " print(f\"**Answer**: {result['answer']} \\n\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/integrations/retrievers/pubmed.ipynb b/docs/extras/integrations/retrievers/pubmed.ipynb index 336ee5fdd8..590538f6e8 100644 --- a/docs/extras/integrations/retrievers/pubmed.ipynb +++ b/docs/extras/integrations/retrievers/pubmed.ipynb @@ -81,7 +81,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.9.18" } }, "nbformat": 4, diff --git a/docs/extras/integrations/retrievers/sec_filings.ipynb b/docs/extras/integrations/retrievers/sec_filings.ipynb new file mode 100644 index 0000000000..7b2f8c3255 --- /dev/null +++ b/docs/extras/integrations/retrievers/sec_filings.ipynb @@ -0,0 +1,165 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "263f914c-9d67-4316-8b3d-03c3b99ba9d8", + "metadata": {}, + "source": [ + "SEC filings data\n", + "=\n", + "\n", + "SEC filings data powered by [Kay.ai](https://kay.ai) and [Cybersyn](https://www.cybersyn.com/).\n", + "\n", + ">The SEC filing is a financial statement or other formal 
document submitted to the U.S. Securities and Exchange Commission (SEC). Public companies, certain insiders, and broker-dealers are required to make regular SEC filings. Investors and financial professionals rely on these filings for information about companies they are evaluating for investment purposes." + ] + }, + { + "cell_type": "markdown", + "id": "fc507b8e-ea51-417c-93da-42bf998a1195", + "metadata": {}, + "source": [ + "Setup\n", + "=\n", + "\n", + "First you will need to install the `kay` package. You will also need an API key: you can get one for free at [https://kay.ai](https://kay.ai/). Once you have an API key, you must set it as an environment variable `KAY_API_KEY`.\n", + "\n", + "In this example we're going to use the `KayAiRetriever`. Take a look at the [kay notebook](/docs/integrations/retrievers/kay) for more detailed information on the parameters that it accepts." + ] + }, + { + "cell_type": "markdown", + "id": "c923bea0-585a-4f62-8662-efc167e8d793", + "metadata": {}, + "source": [ + "Examples\n", + "=\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "f7b8c99c-0341-4f3c-912f-a11e98f7de71", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n", + " ········\n" + ] + } + ], + "source": [ + "# Setup API keys for Kay and OpenAI\n", + "from getpass import getpass\n", + "KAY_API_KEY = getpass()\n", + "OPENAI_API_KEY = getpass()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "04ee2d6b-c2ab-4e15-8a8b-afaf6ef8c0f6", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"KAY_API_KEY\"] = KAY_API_KEY\n", + "os.environ[\"OPENAI_API_KEY\"] = OPENAI_API_KEY" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0c504bcd-f6e0-4028-a797-b31fb4b6d027", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import ConversationalRetrievalChain\n", + "from langchain.chat_models import 
ChatOpenAI\n", + "from langchain.retrievers import KayAiRetriever\n", + "\n", + "model = ChatOpenAI(model_name=\"gpt-3.5-turbo\")\n", + "retriever = KayAiRetriever.create(dataset_id=\"company\", data_types=[\"10-K\", \"10-Q\"], num_contexts=6)\n", + "qa = ConversationalRetrievalChain.from_llm(model, retriever=retriever)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "977f158b-38d3-4b5f-9379-7cdd09436327", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "-> **Question**: What are patterns in Nvidia's spend over the past three quarters? \n", + "\n", + "**Answer**: Based on the provided information, here are the patterns in NVIDIA's spend over the past three quarters:\n", + "\n", + "1. Research and Development Expenses:\n", + " - Q3 2022: Increased by 34% compared to Q3 2021.\n", + " - Q1 2023: Increased by 40% compared to Q1 2022.\n", + " - Q2 2022: Increased by 25% compared to Q2 2021.\n", + " \n", + " Overall, research and development expenses have been consistently increasing over the past three quarters.\n", + "\n", + "2. Sales, General and Administrative Expenses:\n", + " - Q3 2022: Increased by 8% compared to Q3 2021.\n", + " - Q1 2023: Increased by 14% compared to Q1 2022.\n", + " - Q2 2022: Decreased by 16% compared to Q2 2021.\n", + " \n", + " The pattern for sales, general and administrative expenses is not as consistent, with some quarters showing an increase and others showing a decrease.\n", + "\n", + "3. 
Total Operating Expenses:\n", + " - Q3 2022: Increased by 25% compared to Q3 2021.\n", + " - Q1 2023: Increased by 113% compared to Q1 2022.\n", + " - Q2 2022: Increased by 9% compared to Q2 2021.\n", + " \n", + " Total operating expenses have generally been increasing over the past three quarters, with a significant increase in Q1 2023.\n", + "\n", + "Overall, the pattern indicates a consistent increase in research and development expenses and total operating expenses, while sales, general and administrative expenses show some fluctuations. \n", + "\n" + ] + } + ], + "source": [ + "questions = [\n", + " \"What are patterns in Nvidia's spend over the past three quarters?\",\n", + " #\"What are some recent challenges faced by the renewable energy sector?\",\n", + "]\n", + "chat_history = []\n", + "\n", + "for question in questions:\n", + " result = qa({\"question\": question, \"chat_history\": chat_history})\n", + " chat_history.append((question, result[\"answer\"]))\n", + " print(f\"-> **Question**: {question} \\n\")\n", + " print(f\"**Answer**: {result['answer']} \\n\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.18" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/docs/extras/use_cases/web_scraping.ipynb b/docs/extras/use_cases/web_scraping.ipynb index 07a571aaf2..a9aa4a32b1 100644 --- a/docs/extras/use_cases/web_scraping.ipynb +++ b/docs/extras/use_cases/web_scraping.ipynb @@ -1,663 +1,664 @@ { - "cells": [ - { - "cell_type": "raw", - "id": "e254cf03-49fc-4051-a4df-3a8e4e7d2688", - "metadata": {}, - "source": [ - "---\n", - "sidebar_position: 1\n", - "title: Web scraping\n", - "---" - ] - }, - { - "cell_type": 
"markdown", - "id": "6605e7f7", - "metadata": {}, - "source": [ - "[![Open In Collab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/extras/use_cases/web_scraping.ipynb)\n", - "\n", - "## Use case\n", - "\n", - "[Web research](https://blog.langchain.dev/automating-web-research/) is one of the killer LLM applications:\n", - "\n", - "* Users have [highlighted it](https://twitter.com/GregKamradt/status/1679913813297225729?s=20) as one of his top desired AI tools. \n", - "* OSS repos like [gpt-researcher](https://github.com/assafelovic/gpt-researcher) are growing in popularity. \n", - " \n", - "![Image description](/img/web_scraping.png)\n", - " \n", - "## Overview\n", - "\n", - "Gathering content from the web has a few components:\n", - "\n", - "* `Search`: Query to url (e.g., using `GoogleSearchAPIWrapper`).\n", - "* `Loading`: Url to HTML (e.g., using `AsyncHtmlLoader`, `AsyncChromiumLoader`, etc).\n", - "* `Transforming`: HTML to formatted text (e.g., using `HTML2Text` or `Beautiful Soup`).\n", - "\n", - "## Quickstart" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1803c182", - "metadata": {}, - "outputs": [], - "source": [ - "pip install -q openai langchain playwright beautifulsoup4\n", - "playwright install\n", - "\n", - "# Set env var OPENAI_API_KEY or load from a .env file:\n", - "# import dotenv\n", - "# dotenv.load_dotenv()" - ] - }, - { - "cell_type": "markdown", - "id": "50741083", - "metadata": {}, - "source": [ - "Scraping HTML content using a headless instance of Chromium.\n", - "\n", - "* The async nature of the scraping process is handled using Python's asyncio library.\n", - "* The actual interaction with the web pages is handled by Playwright." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "cd457cb1", - "metadata": {}, - "outputs": [], - "source": [ - "from langchain.document_loaders import AsyncChromiumLoader\n", - "from langchain.document_transformers import BeautifulSoupTransformer\n", - "\n", - "# Load HTML\n", - "loader = AsyncChromiumLoader([\"https://www.wsj.com\"])\n", - "html = loader.load()" - ] - }, - { - "cell_type": "markdown", - "id": "2a879806", - "metadata": {}, - "source": [ - "Scrape text content tags such as `

,

  • ,
    , and ` tags from the HTML content:\n", - "\n", - "* `

    `: The paragraph tag. It defines a paragraph in HTML and is used to group together related sentences and/or phrases.\n", - " \n", - "* `

  • `: The list item tag. It is used within ordered (`
      `) and unordered (`