forked from Archives/langchain
Harrison/vespa (#3761)
Co-authored-by: Lester Solbakken <lesters@users.noreply.github.com>
This commit is contained in:
parent
ee20b3e0d0
commit
c55ba43093
124
docs/modules/indexes/retrievers/examples/vespa_retriever.ipynb
Normal file
124
docs/modules/indexes/retrievers/examples/vespa_retriever.ipynb
Normal file
@ -0,0 +1,124 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "ce0f17b9",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Vespa retriever\n",
|
||||||
|
"\n",
|
||||||
|
"This notebook shows how to use Vespa.ai as a LangChain retriever.\n",
|
||||||
|
"Vespa.ai is a platform for highly efficient structured text and vector search.\n",
|
||||||
|
"Please refer to [Vespa.ai](https://vespa.ai) for more information.\n",
|
||||||
|
"\n",
|
||||||
|
"In order to create a retriever, we use [pyvespa](https://pyvespa.readthedocs.io/en/latest/index.html) to\n",
|
||||||
|
"create a connection to a Vespa service."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 1,
|
||||||
|
"id": "c10dd962",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from vespa.application import Vespa\n",
|
||||||
|
"\n",
|
||||||
|
"vespa_app = Vespa(url=\"https://doc-search.vespa.oath.cloud\")"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3df4ce53",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"This creates a connection to a Vespa service, here the Vespa documentation search service.\n",
|
||||||
|
"Using pyvespa, you can also connect to a\n",
|
||||||
|
"[Vespa Cloud instance](https://pyvespa.readthedocs.io/en/latest/deploy-vespa-cloud.html)\n",
|
||||||
|
"or a local\n",
|
||||||
|
"[Docker instance](https://pyvespa.readthedocs.io/en/latest/deploy-docker.html).\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
"After connecting to the service, you can set up the retriever:"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "7ccca1f4",
|
||||||
|
"metadata": {
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.retrievers.vespa_retriever import VespaRetriever\n",
|
||||||
|
"\n",
|
||||||
|
"vespa_query_body = {\n",
|
||||||
|
" \"yql\": \"select content from paragraph where userQuery()\",\n",
|
||||||
|
" \"hits\": 5,\n",
|
||||||
|
" \"ranking\": \"documentation\",\n",
|
||||||
|
" \"locale\": \"en-us\"\n",
|
||||||
|
"}\n",
|
||||||
|
"vespa_content_field = \"content\"\n",
|
||||||
|
"retriever = VespaRetriever(vespa_app, vespa_query_body, vespa_content_field)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "1e7e34e1",
|
||||||
|
"metadata": {
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%% md\n"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"source": [
|
||||||
|
"This sets up a LangChain retriever that fetches documents from the Vespa application.\n",
|
||||||
|
"Here, up to 5 results are retrieved from the `content` field in the `paragraph` document type,\n",
|
||||||
|
"using `documentation` as the ranking method. The `userQuery()` is replaced with the actual query\n",
|
||||||
|
"passed from LangChain.\n",
|
||||||
|
"\n",
|
||||||
|
"Please refer to the [pyvespa documentation](https://pyvespa.readthedocs.io/en/latest/getting-started-pyvespa.html#Query)\n",
|
||||||
|
"for more information.\n",
|
||||||
|
"\n",
|
||||||
|
"Now you can return the results and continue using the results in LangChain."
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 2,
|
||||||
|
"id": "f47a2bfe",
|
||||||
|
"metadata": {
|
||||||
|
"pycharm": {
|
||||||
|
"name": "#%%\n"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"retriever.get_relevant_documents(\"what is vespa?\")"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.9.5"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -485,7 +485,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.16"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -841,7 +841,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.16"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -410,7 +410,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.16"
|
"version": "3.9.1"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -10,6 +10,7 @@ from langchain.retrievers.tfidf import TFIDFRetriever
|
|||||||
from langchain.retrievers.time_weighted_retriever import (
|
from langchain.retrievers.time_weighted_retriever import (
|
||||||
TimeWeightedVectorStoreRetriever,
|
TimeWeightedVectorStoreRetriever,
|
||||||
)
|
)
|
||||||
|
from langchain.retrievers.vespa_retriever import VespaRetriever
|
||||||
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
|
from langchain.retrievers.weaviate_hybrid_search import WeaviateHybridSearchRetriever
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@ -24,4 +25,5 @@ __all__ = [
|
|||||||
"DataberryRetriever",
|
"DataberryRetriever",
|
||||||
"TimeWeightedVectorStoreRetriever",
|
"TimeWeightedVectorStoreRetriever",
|
||||||
"SVMRetriever",
|
"SVMRetriever",
|
||||||
|
"VespaRetriever",
|
||||||
]
|
]
|
||||||
|
44
langchain/retrievers/vespa_retriever.py
Normal file
44
langchain/retrievers/vespa_retriever.py
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
"""Wrapper for retrieving documents from Vespa."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import json
|
||||||
|
from typing import TYPE_CHECKING, List
|
||||||
|
|
||||||
|
from langchain.schema import BaseRetriever, Document
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from vespa.application import Vespa
|
||||||
|
|
||||||
|
|
||||||
|
class VespaRetriever(BaseRetriever):
    """Retriever that fetches documents from a Vespa.ai application.

    Wraps a pyvespa ``Vespa`` connection and a query body template; each
    retrieval copies the template, injects the user query, and maps the
    returned hits to LangChain ``Document`` objects.
    """

    def __init__(self, app: Vespa, body: dict, content_field: str):
        """Set up the retriever.

        Args:
            app: Connected pyvespa ``Vespa`` application instance.
            body: Vespa query body template (YQL, hits, ranking, ...);
                copied on every query, so the caller's dict is never mutated.
            content_field: Name of the hit field whose value becomes the
                ``page_content`` of each returned ``Document``.
        """
        self._application = app
        self._query_body = body
        self._content_field = content_field

    def get_relevant_documents(self, query: str) -> List[Document]:
        """Run the templated Vespa query with ``query`` and return documents.

        Raises:
            RuntimeError: If the HTTP status is not 2xx, or if the response
                root carries Vespa-level errors.
        """
        # Never mutate the shared template; inject the query into a copy.
        request_body = self._query_body.copy()
        request_body["query"] = query

        response = self._application.query(request_body)

        # Any non-2xx status means the request itself failed.
        if not str(response.status_code).startswith("2"):
            raise RuntimeError(
                "Could not retrieve data from Vespa. Error code: {}".format(
                    response.status_code
                )
            )

        # A 2xx response can still carry query-level errors in the root node.
        root = response.json["root"]
        if "errors" in root:
            raise RuntimeError(json.dumps(root["errors"]))

        return [
            Document(
                page_content=hit["fields"][self._content_field],
                metadata={"id": hit["id"]},
            )
            for hit in response.hits
        ]

    async def aget_relevant_documents(self, query: str) -> List[Document]:
        """Async retrieval is not supported by this retriever."""
        raise NotImplementedError
|
33
poetry.lock
generated
33
poetry.lock
generated
@ -5213,7 +5213,7 @@ test = ["mock", "pytest", "pytest-coverage", "typer-cli"]
|
|||||||
name = "pexpect"
|
name = "pexpect"
|
||||||
version = "4.8.0"
|
version = "4.8.0"
|
||||||
description = "Pexpect allows easy control of interactive console applications."
|
description = "Pexpect allows easy control of interactive console applications."
|
||||||
category = "dev"
|
category = "main"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
@ -5739,7 +5739,7 @@ files = [
|
|||||||
name = "ptyprocess"
|
name = "ptyprocess"
|
||||||
version = "0.7.0"
|
version = "0.7.0"
|
||||||
description = "Run a subprocess in a pseudo terminal"
|
description = "Run a subprocess in a pseudo terminal"
|
||||||
category = "dev"
|
category = "main"
|
||||||
optional = false
|
optional = false
|
||||||
python-versions = "*"
|
python-versions = "*"
|
||||||
files = [
|
files = [
|
||||||
@ -6392,6 +6392,31 @@ files = [
|
|||||||
{file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"},
|
{file = "pytz-2023.3.tar.gz", hash = "sha256:1d8ce29db189191fb55338ee6d0387d82ab59f3d00eac103412d64e0ebd0c588"},
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pyvespa"
|
||||||
|
version = "0.33.0"
|
||||||
|
description = "Python API for vespa.ai"
|
||||||
|
category = "main"
|
||||||
|
optional = true
|
||||||
|
python-versions = ">=3.6"
|
||||||
|
files = [
|
||||||
|
{file = "pyvespa-0.33.0-py3-none-any.whl", hash = "sha256:2681910b3ac5f0259a9e41e6e2649caba2801e836b4c295cc2e48ab25b09672c"},
|
||||||
|
{file = "pyvespa-0.33.0.tar.gz", hash = "sha256:be3da9022276555b6b25c40b6e846db6e9dbf617486001ba92235ccfab6c9353"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
aiohttp = "*"
|
||||||
|
cryptography = "*"
|
||||||
|
docker = "*"
|
||||||
|
jinja2 = "*"
|
||||||
|
pandas = "*"
|
||||||
|
requests = "*"
|
||||||
|
tenacity = "*"
|
||||||
|
|
||||||
|
[package.extras]
|
||||||
|
full = ["keras-tuner", "onnxruntime", "tensorflow", "tensorflow-ranking", "torch (<1.13)", "transformers"]
|
||||||
|
ml = ["keras-tuner", "tensorflow", "tensorflow-ranking", "torch (<1.13)", "transformers"]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pywin32"
|
name = "pywin32"
|
||||||
version = "306"
|
version = "306"
|
||||||
@ -9393,7 +9418,7 @@ cffi = {version = ">=1.11", markers = "platform_python_implementation == \"PyPy\
|
|||||||
cffi = ["cffi (>=1.11)"]
|
cffi = ["cffi (>=1.11)"]
|
||||||
|
|
||||||
[extras]
|
[extras]
|
||||||
all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
|
all = ["aleph-alpha-client", "anthropic", "arxiv", "atlassian-python-api", "azure-cosmos", "azure-identity", "beautifulsoup4", "clickhouse-connect", "cohere", "deeplake", "duckduckgo-search", "elasticsearch", "faiss-cpu", "google-api-python-client", "google-search-results", "gptcache", "html2text", "huggingface_hub", "jina", "jinja2", "lancedb", "lark", "manifest-ml", "networkx", "nlpcloud", "nltk", "nomic", "openai", "opensearch-py", "pexpect", "pgvector", "pinecone-client", "pinecone-text", "psycopg2-binary", "pyowm", "pypdf", "pytesseract", "qdrant-client", "redis", "sentence-transformers", "spacy", "tensorflow-text", "tiktoken", "torch", "transformers", "weaviate-client", "wikipedia", "wolframalpha"]
|
||||||
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
|
azure = ["azure-core", "azure-cosmos", "azure-identity", "openai"]
|
||||||
cohere = ["cohere"]
|
cohere = ["cohere"]
|
||||||
embeddings = ["sentence-transformers"]
|
embeddings = ["sentence-transformers"]
|
||||||
@ -9404,4 +9429,4 @@ qdrant = ["qdrant-client"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = ">=3.8.1,<4.0"
|
python-versions = ">=3.8.1,<4.0"
|
||||||
content-hash = "f7ff48dfce65630ea5c67287e91d923be83b9d0d9dd68639afcbc29f5f6f9c5f"
|
content-hash = "2ef913e267f1a10beee9f97924dc38df89fbe8a1ddc0de0f6e6f04e272763823"
|
||||||
|
@ -74,6 +74,7 @@ azure-cosmos = {version="^4.4.0b1", optional=true}
|
|||||||
lark = {version="^1.1.5", optional=true}
|
lark = {version="^1.1.5", optional=true}
|
||||||
lancedb = {version = "^0.1", optional = true}
|
lancedb = {version = "^0.1", optional = true}
|
||||||
pexpect = {version = "^4.8.0", optional = true}
|
pexpect = {version = "^4.8.0", optional = true}
|
||||||
|
pyvespa = {version = "^0.33.0", optional = true}
|
||||||
|
|
||||||
[tool.poetry.group.docs.dependencies]
|
[tool.poetry.group.docs.dependencies]
|
||||||
autodoc_pydantic = "^1.8.0"
|
autodoc_pydantic = "^1.8.0"
|
||||||
@ -152,7 +153,7 @@ openai = ["openai"]
|
|||||||
cohere = ["cohere"]
|
cohere = ["cohere"]
|
||||||
embeddings = ["sentence-transformers"]
|
embeddings = ["sentence-transformers"]
|
||||||
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
|
azure = ["azure-identity", "azure-cosmos", "openai", "azure-core"]
|
||||||
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect"]
|
all = ["anthropic", "cohere", "openai", "nlpcloud", "huggingface_hub", "jina", "manifest-ml", "elasticsearch", "opensearch-py", "google-search-results", "faiss-cpu", "sentence-transformers", "transformers", "spacy", "nltk", "wikipedia", "beautifulsoup4", "tiktoken", "torch", "jinja2", "pinecone-client", "pinecone-text", "weaviate-client", "redis", "google-api-python-client", "wolframalpha", "qdrant-client", "tensorflow-text", "pypdf", "networkx", "nomic", "aleph-alpha-client", "deeplake", "pgvector", "psycopg2-binary", "boto3", "pyowm", "pytesseract", "html2text", "atlassian-python-api", "gptcache", "duckduckgo-search", "arxiv", "azure-identity", "clickhouse-connect", "azure-cosmos", "lancedb", "lark", "pexpect", "pyvespa"]
|
||||||
|
|
||||||
[tool.ruff]
|
[tool.ruff]
|
||||||
select = [
|
select = [
|
||||||
|
Loading…
Reference in New Issue
Block a user