From c55ba43093c10df8de8f9273c12012573a7651a7 Mon Sep 17 00:00:00 2001
From: Harrison Chase
Date: Fri, 28 Apr 2023 19:48:43 -0700
Subject: [PATCH] Harrison/vespa (#3761)

Co-authored-by: Lester Solbakken
---
 .../retrievers/examples/vespa_retriever.ipynb | 124 ++++++++++++++++++
 .../agent_simulations/multi_player_dnd.ipynb  |   2 +-
 .../multiagent_authoritarian.ipynb            |   2 +-
 .../agent_simulations/two_player_dnd.ipynb    |   2 +-
 langchain/retrievers/__init__.py              |   2 +
 langchain/retrievers/vespa_retriever.py       |  44 +++++++
 poetry.lock                                   |  33 ++++-
 pyproject.toml                                |   3 +-
 8 files changed, 204 insertions(+), 8 deletions(-)
 create mode 100644 docs/modules/indexes/retrievers/examples/vespa_retriever.ipynb
 create mode 100644 langchain/retrievers/vespa_retriever.py

diff --git a/docs/modules/indexes/retrievers/examples/vespa_retriever.ipynb b/docs/modules/indexes/retrievers/examples/vespa_retriever.ipynb
new file mode 100644
index 00000000..553991d9
--- /dev/null
+++ b/docs/modules/indexes/retrievers/examples/vespa_retriever.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "ce0f17b9",
+   "metadata": {},
+   "source": [
+    "# Vespa retriever\n",
+    "\n",
+    "This notebook shows how to use Vespa.ai as a LangChain retriever.\n",
+    "Vespa.ai is a platform for highly efficient structured text and vector search.\n",
+    "Please refer to [Vespa.ai](https://vespa.ai) for more information.\n",
+    "\n",
+    "In order to create a retriever, we use [pyvespa](https://pyvespa.readthedocs.io/en/latest/index.html) to\n",
+    "create a connection to a Vespa service."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "c10dd962",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from vespa.application import Vespa\n",
+    "\n",
+    "vespa_app = Vespa(url=\"https://doc-search.vespa.oath.cloud\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3df4ce53",
+   "metadata": {},
+   "source": [
+    "This creates a connection to a Vespa service, here the Vespa documentation search service.\n",
+    "Using pyvespa, you can also connect to a\n",
+    "[Vespa Cloud instance](https://pyvespa.readthedocs.io/en/latest/deploy-vespa-cloud.html)\n",
+    "or a local\n",
+    "[Docker instance](https://pyvespa.readthedocs.io/en/latest/deploy-docker.html).\n",
+    "\n",
+    "\n",
+    "After connecting to the service, you can set up the retriever:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7ccca1f4",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "from langchain.retrievers.vespa_retriever import VespaRetriever\n",
+    "\n",
+    "vespa_query_body = {\n",
+    "    \"yql\": \"select content from paragraph where userQuery()\",\n",
+    "    \"hits\": 5,\n",
+    "    \"ranking\": \"documentation\",\n",
+    "    \"locale\": \"en-us\"\n",
+    "}\n",
+    "vespa_content_field = \"content\"\n",
+    "retriever = VespaRetriever(vespa_app, vespa_query_body, vespa_content_field)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1e7e34e1",
+   "metadata": {
+    "pycharm": {
+     "name": "#%% md\n"
+    }
+   },
+   "source": [
+    "This sets up a LangChain retriever that fetches documents from the Vespa application.\n",
+    "Here, up to 5 results are retrieved from the `content` field in the `paragraph` document type,\n",
+    "using `documentation` as the ranking method. The `userQuery()` is replaced with the actual query\n",
+    "passed from LangChain.\n",
+    "\n",
+    "Please refer to the [pyvespa documentation](https://pyvespa.readthedocs.io/en/latest/getting-started-pyvespa.html#Query)\n",
+    "for more information.\n",
+    "\n",
+    "Now you can return the results and continue using them in LangChain."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "f47a2bfe",
+   "metadata": {
+    "pycharm": {
+     "name": "#%%\n"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "retriever.get_relevant_documents(\"what is vespa?\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
\ No newline at end of file
diff --git a/docs/use_cases/agent_simulations/multi_player_dnd.ipynb b/docs/use_cases/agent_simulations/multi_player_dnd.ipynb
index d55d5924..612065e8 100644
--- a/docs/use_cases/agent_simulations/multi_player_dnd.ipynb
+++ b/docs/use_cases/agent_simulations/multi_player_dnd.ipynb
@@ -485,7 +485,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.16"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/docs/use_cases/agent_simulations/multiagent_authoritarian.ipynb b/docs/use_cases/agent_simulations/multiagent_authoritarian.ipynb
index 878ca884..bfab4f9e 100644
--- a/docs/use_cases/agent_simulations/multiagent_authoritarian.ipynb
+++ b/docs/use_cases/agent_simulations/multiagent_authoritarian.ipynb
@@ -841,7 +841,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.16"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/docs/use_cases/agent_simulations/two_player_dnd.ipynb b/docs/use_cases/agent_simulations/two_player_dnd.ipynb
index ef5f2e2a..bf3359aa 100644
--- a/docs/use_cases/agent_simulations/two_player_dnd.ipynb
+++ b/docs/use_cases/agent_simulations/two_player_dnd.ipynb
@@ -410,7 +410,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.9.16"
+   "version": "3.9.1"
   }
  },
  "nbformat": 4,
diff --git a/langchain/retrievers/__init__.py b/langchain/retrievers/__init__.py
index d89cf9d8..9137901f 100644
--- a/langchain/retrievers/__init__.py
+++ b/langchain/retrievers/__init__.py
@@ -10,6 +10,7 @@ from langchain.retrievers.tfidf import TFIDFRetriever
 from langchain.retrievers.time_weighted_retriever import (
     TimeWeightedVectorStoreRetriever,
 )
+from langchain.retrievers.vespa_retriever import VespaRetriever
 from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
 
 __all__ = [
@@ -24,4 +25,5 @@ __all__ = [
     "DataberryRetriever",
     "TimeWeightedVectorStoreRetriever",
     "SVMRetriever",
+    "VespaRetriever",
 ]
diff --git a/langchain/retrievers/vespa_retriever.py b/langchain/retrievers/vespa_retriever.py
new file mode 100644
index 00000000..2451cb91
--- /dev/null
+++ b/langchain/retrievers/vespa_retriever.py
@@ -0,0 +1,44 @@
+"""Wrapper for retrieving documents from Vespa."""
+
+from __future__ import annotations
+
+import json
+from typing import TYPE_CHECKING, List
+
+from langchain.schema import BaseRetriever, Document
+
+if TYPE_CHECKING:
+    from vespa.application import Vespa
+
+
+class VespaRetriever(BaseRetriever):
+    def __init__(self, app: Vespa, body: dict, content_field: str):
+        self._application = app
+        self._query_body = body
+        self._content_field = content_field
+
+    def get_relevant_documents(self, query: str) -> List[Document]:
+        body = self._query_body.copy()
+        body["query"] = query
+        response = self._application.query(body)
+
+        if not str(response.status_code).startswith("2"):
+            raise RuntimeError(
+                "Could not retrieve data from Vespa. Error code: {}".format(
+                    response.status_code
+                )
+            )
+
+        root = response.json["root"]
+        if "errors" in root:
+            raise RuntimeError(json.dumps(root["errors"]))
+
+        hits = []
+        for child in response.hits:
+            page_content = child["fields"][self._content_field]
+            metadata = {"id": child["id"]}
+            hits.append(Document(page_content=page_content, metadata=metadata))
+        return hits
+
+    async def aget_relevant_documents(self, query: str) -> List[Document]:
+        raise NotImplementedError
diff --git a/poetry.lock b/poetry.lock
index a5ae60c1..f3b6f0c6 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -5213,7 +5213,7 @@ test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
 name = "pexpect"
 version = "4.8.0"
 description = "Pexpect allows easy control of interactive console applications."
-category = "dev"
+category = "main"
 optional = false
 python-versions = "*"
 files = [
@@ -5739,7 +5739,7 @@ files = [
 name = "ptyprocess"
 version = "0.7.0"
 description = "Run a subprocess in a pseudo terminal"
-category = "dev"
+category = "main"
 optional = false
 python-versions = "*"
 files = [
@@ -6392,6 +6392,31 @@ files = [
     {file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"},
 ]
 
+[[package]]
+name = "pyvespa"
+version = "0.33.0"
+description = "Python API for vespa.ai"
+category = "main"
+optional = true
+python-versions = ">=3.6"
+files = [
+    {file = "pyvespa-0.33.0-py3-none-any.whl", hash = "sha256:2681910b3ac5f0259a9e41e6e2649caba2801e836b4c295cc2e48ab25b09672c"},
+    {file = "pyvespa-0.33.0.tar.gz", hash = "sha256:be3da9022276555b6b25c40b6e846db6e9dbf617486001ba92235ccfab6c9353"},
+]
+
+[package.dependencies]
+aiohttp = "*"
+cryptography = "*"
+docker = "*"
+jinja2 = "*"
+pandas = "*"
+requests = "*"
+tenacity = "*"
+
+[package.extras]
+full = ["keras-tuner", "onnxruntime", "tensorflow", "tensorflow-ranking", "torch (<1.13)", "transformers"]
+ml = ["keras-tuner", "tensorflow", "tensorflow-ranking", "torch (<1.13)", "transformers"]
+
 [[package]]
 name = "pywin32"
 version = "306"
@@ -9393,7 +9418,7 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
 cffi = ["cffi (>=1.11)"]
 
 [extras]
-all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
"elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"] azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] @@ -9404,4 +9429,4 @@ qdrant = ["qdrant-client"] [metadata] lock-version = "2.0" python-versions = ">=3.8.1,<4.0" -content-hash = "f7ff48dfce65630ea5c67287e91d923be83b9d0d9dd68639afcbc29f5f6f9c5f" +content-hash = "2ef913e267f1a10beee9f97924dc38df89fbe8a1ddc0de0f6e6f04e272763823" diff --git a/pyproject.toml b/pyproject.toml index cb611944..c4c6f770 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -74,6 +74,7 @@ azure-cosmos = {version="^4.4.0b1", optional=true} lark = {version="^1.1.5", optional=true} lancedb = {version = "^0.1", optional = true} pexpect = {version = "^4.8.0", optional = true} +pyvespa = {version = "^0.33.0", optional = true} [tool.poetry.group.docs.dependencies] autodoc_pydantic = "^1.8.0" @@ -152,7 +153,7 @@ openai = ["openai"] cohere = ["cohere"] embeddings = ["sentence-transformers"] azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"] -all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect"] +all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa"] [tool.ruff] select = [