Harrison/vespa (#3761)

Co-authored-by: Lester Solbakken <lesters@users.noreply.github.com>
This commit is contained in:
Harrison Chase 2023-04-28 19:48:43 -07:00 committed by GitHub
parent ee20b3e0d0
commit c55ba43093
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 204 additions and 8 deletions

View File

@ -0,0 +1,124 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "ce0f17b9",
"metadata": {},
"source": [
"# Vespa retriever\n",
"\n",
"This notebook shows how to use Vespa.ai as a LangChain retriever.\n",
"Vespa.ai is a platform for highly efficient structured text and vector search.\n",
"Please refer to [Vespa.ai](https://vespa.ai) for more information.\n",
"\n",
"In order to create a retriever, we use [pyvespa](https://pyvespa.readthedocs.io/en/latest/index.html) to\n",
"create a connection to a Vespa service."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "c10dd962",
"metadata": {},
"outputs": [],
"source": [
"from vespa.application import Vespa\n",
"\n",
"vespa_app = Vespa(url=\"https://doc-search.vespa.oath.cloud\")"
]
},
{
"cell_type": "markdown",
"id": "3df4ce53",
"metadata": {},
"source": [
"This creates a connection to a Vespa service, here the Vespa documentation search service.\n",
"Using pyvespa, you can also connect to a\n",
"[Vespa Cloud instance](https://pyvespa.readthedocs.io/en/latest/deploy-vespa-cloud.html)\n",
"or a local\n",
"[Docker instance](https://pyvespa.readthedocs.io/en/latest/deploy-docker.html).\n",
"\n",
"\n",
"After connecting to the service, you can set up the retriever:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7ccca1f4",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"from langchain.retrievers.vespa_retriever import VespaRetriever\n",
"\n",
"vespa_query_body = {\n",
" \"yql\": \"select content from paragraph where userQuery()\",\n",
" \"hits\": 5,\n",
" \"ranking\": \"documentation\",\n",
" \"locale\": \"en-us\"\n",
"}\n",
"vespa_content_field = \"content\"\n",
"retriever = VespaRetriever(vespa_app, vespa_query_body, vespa_content_field)"
]
},
{
"cell_type": "markdown",
"id": "1e7e34e1",
"metadata": {
"pycharm": {
"name": "#%% md\n"
}
},
"source": [
"This sets up a LangChain retriever that fetches documents from the Vespa application.\n",
"Here, up to 5 results are retrieved from the `content` field in the `paragraph` document type,\n",
"using `documentation` as the ranking method. The `userQuery()` is replaced with the actual query\n",
"passed from LangChain.\n",
"\n",
"Please refer to the [pyvespa documentation](https://pyvespa.readthedocs.io/en/latest/getting-started-pyvespa.html#Query)\n",
"for more information.\n",
"\n",
"Now you can retrieve the results and continue using them in LangChain."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "f47a2bfe",
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"retriever.get_relevant_documents(\"what is vespa?\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.5"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -485,7 +485,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

View File

@ -841,7 +841,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

View File

@ -410,7 +410,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
"version": "3.9.1"
}
},
"nbformat": 4,

View File

@ -10,6 +10,7 @@ from langchain.retrievers.tfidf import TFIDFRetriever
from langchain.retrievers.time_weighted_retriever import (
TimeWeightedVectorStoreRetriever,
)
from langchain.retrievers.vespa_retriever import VespaRetriever
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
__all__ = [
@ -24,4 +25,5 @@ __all__ = [
"DataberryRetriever",
"TimeWeightedVectorStoreRetriever",
"SVMRetriever",
"VespaRetriever",
]

View File

@ -0,0 +1,44 @@
"""Wrapper for retrieving documents from Vespa."""
from __future__ import annotations
import json
from typing import TYPE_CHECKING, List
from langchain.schema import BaseRetriever, Document
if TYPE_CHECKING:
from vespa.application import Vespa
class VespaRetriever(BaseRetriever):
    """Retriever that turns the hits of a Vespa query into ``Document`` objects.

    Each lookup sends a per-call variant of a stored query body to the
    Vespa application and wraps one field of every returned hit as the
    page content of a ``Document``.
    """

    def __init__(self, app: Vespa, body: dict, content_field: str):
        """Remember the connection and the query settings.

        Args:
            app: Object whose ``query(body)`` method is invoked on every lookup.
            body: Query body template. It is never modified; each lookup works
                on a shallow copy with the ``"query"`` key set.
            content_field: Name of the hit field used as ``page_content``.
        """
        self._content_field = content_field
        self._query_body = body
        self._application = app

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Run ``query`` against Vespa and return the hits as documents.

        Args:
            query: Text stored under the ``"query"`` key of the request body.

        Returns:
            One ``Document`` per entry of ``response.hits``, in the same order.
            The metadata of each document holds the hit's ``"id"``.

        Raises:
            RuntimeError: If the textual form of the response status code does
                not start with ``"2"``, or if the response root has an
                ``"errors"`` entry (the errors are serialized into the message).
        """
        # Build a fresh dict so the stored template stays untouched.
        request_body = {**self._query_body, "query": query}
        response = self._application.query(request_body)

        status = response.status_code
        if not str(status).startswith("2"):
            raise RuntimeError(
                "Could not retrieve data from Vespa. Error code: {}".format(status)
            )

        root = response.json["root"]
        if "errors" in root:
            raise RuntimeError(json.dumps(root["errors"]))

        # Hits are indexed directly: a hit lacking the content field or an
        # "id" propagates the lookup error to the caller.
        return [
            Document(
                page_content=hit["fields"][self._content_field],
                metadata={"id": hit["id"]},
            )
            for hit in response.hits
        ]

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Asynchronous lookup; not implemented for this retriever."""
        raise NotImplementedError

33
poetry.lock generated
View File

@ -5213,7 +5213,7 @@ test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
name = "pexpect"
version = "4.8.0"
description = "Pexpect allows easy control of interactive console applications."
category = "dev"
category = "main"
optional = false
python-versions = "*"
files = [
@ -5739,7 +5739,7 @@ files = [
name = "ptyprocess"
version = "0.7.0"
description = "Run a subprocess in a pseudo terminal"
category = "dev"
category = "main"
optional = false
python-versions = "*"
files = [
@ -6392,6 +6392,31 @@ files = [
{file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"},
]
[[package]]
name = "pyvespa"
version = "0.33.0"
description = "Python API for vespa.ai"
category = "main"
optional = true
python-versions = ">=3.6"
files = [
{file = "pyvespa-0.33.0-py3-none-any.whl", hash = "sha256:2681910b3ac5f0259a9e41e6e2649caba2801e836b4c295cc2e48ab25b09672c"},
{file = "pyvespa-0.33.0.tar.gz", hash = "sha256:be3da9022276555b6b25c40b6e846db6e9dbf617486001ba92235ccfab6c9353"},
]
[package.dependencies]
aiohttp = "*"
cryptography = "*"
docker = "*"
jinja2 = "*"
pandas = "*"
requests = "*"
tenacity = "*"
[package.extras]
full = ["keras-tuner", "onnxruntime", "tensorflow", "tensorflow-ranking", "torch (<1.13)", "transformers"]
ml = ["keras-tuner", "tensorflow", "tensorflow-ranking", "torch (<1.13)", "transformers"]
[[package]]
name = "pywin32"
version = "306"
@ -9393,7 +9418,7 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
cffi = ["cffi (>=1.11)"]
[extras]
all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
cohere = ["cohere"]
embeddings = ["sentence-transformers"]
@ -9404,4 +9429,4 @@ qdrant = ["qdrant-client"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.8.1,<4.0"
content-hash = "f7ff48dfce65630ea5c67287e91d923be83b9d0d9dd68639afcbc29f5f6f9c5f"
content-hash = "2ef913e267f1a10beee9f97924dc38df89fbe8a1ddc0de0f6e6f04e272763823"

View File

@ -74,6 +74,7 @@ azure-cosmos = {version="^4.4.0b1", optional=true}
lark = {version="^1.1.5", optional=true}
lancedb = {version = "^0.1", optional = true}
pexpect = {version = "^4.8.0", optional = true}
pyvespa = {version = "^0.33.0", optional = true}
[tool.poetry.group.docs.dependencies]
autodoc_pydantic = "^1.8.0"
@ -152,7 +153,7 @@ openai = ["openai"]
cohere = ["cohere"]
embeddings = ["sentence-transformers"]
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect"]
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa"]
[tool.ruff]
select = [