mirror of https://github.com/hwchase17/langchain
Add youdotcom retriever (#11304)
--------- Co-authored-by: Bagatur <baskaryan@gmail.com>pull/10662/head^2
parent
1655ff2ded
commit
9903a70379
@ -0,0 +1,62 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "47828a7a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Using the You.com Retriever\n",
|
||||
"The You.com retriever is well suited to retrieving large amounts of text. For each URL we find relevant, we return several of the best text snippets.\n",
|
||||
"\n",
|
||||
"First, you just need to initialize the retriever:"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "a90d61d4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.retrievers.you import YouRetriever\n",
|
||||
"from langchain.chains import RetrievalQA\n",
|
||||
"from langchain.llms import OpenAI\n",
|
||||
"\n",
|
||||
"yr = YouRetriever()\n",
|
||||
"qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type=\"map_reduce\", retriever=yr)\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "4a223f2f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"query = \"what starting ohio state quarterback most recently went their entire college career without beating Michigan?\"\n",
|
||||
"qa.run(query)"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.17"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -0,0 +1,46 @@
|
||||
from typing import Any, Dict, List
|
||||
|
||||
from langchain.callbacks.manager import CallbackManagerForRetrieverRun
|
||||
from langchain.pydantic_v1 import root_validator
|
||||
from langchain.schema import BaseRetriever, Document
|
||||
from langchain.utils import get_from_dict_or_env
|
||||
|
||||
|
||||
class YouRetriever(BaseRetriever):
    """`You` retriever that uses You.com's search API.

    Connecting to the You.com API requires an API key, which you can get
    by emailing api@you.com.
    You can check out the docs at https://documentation.you.com.

    The key is resolved by ``get_from_dict_or_env`` from the ``ydc_api_key``
    field or the ``YDC_API_KEY`` environment variable.
    """

    # API key sent to You.com in the ``X-API-Key`` request header.
    ydc_api_key: str

    @root_validator(pre=True)
    def validate_client(
        cls,
        values: Dict[str, Any],
    ) -> Dict[str, Any]:
        """Fill in ``ydc_api_key`` before field validation runs.

        Args:
            values: Raw constructor values for the model.

        Returns:
            The same mapping with ``ydc_api_key`` set to whatever
            ``get_from_dict_or_env`` resolved for the ``ydc_api_key`` /
            ``YDC_API_KEY`` pair.
        """
        values["ydc_api_key"] = get_from_dict_or_env(
            values, "ydc_api_key", "YDC_API_KEY"
        )
        return values

    def _get_relevant_documents(
        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
    ) -> List[Document]:
        """Search You.com for ``query`` and return one document per snippet.

        Args:
            query: Free-form search text.
            run_manager: Callback manager for this retriever run (unused here).

        Returns:
            A flat list of ``Document`` objects, one for every entry in each
            hit's ``"snippets"`` list, in the order the API returned them.

        Raises:
            KeyError: If the response JSON lacks ``"hits"`` or a hit lacks
                ``"snippets"``.
        """
        import requests

        # Pass the query via ``params`` so requests URL-encodes it; splicing
        # it into the URL by hand breaks on characters such as "&", "#" or "+".
        response = requests.get(
            "https://api.ydc-index.io/search",
            params={"query": query},
            headers={"X-API-Key": self.ydc_api_key},
        )
        results = response.json()

        docs: List[Document] = []
        for hit in results["hits"]:
            docs.extend(Document(page_content=snippet) for snippet in hit["snippets"])
        return docs
|
@ -0,0 +1,16 @@
|
||||
import os
|
||||
|
||||
from langchain.retrievers.you import YouRetriever
|
||||
|
||||
|
||||
class TestYouRetriever:
    """Integration test that exercises ``YouRetriever`` without any mocking."""

    @classmethod
    def setup_class(cls) -> None:
        """Abort the whole class when no You.com API key is configured."""
        api_key = os.environ.get("YDC_API_KEY")
        if not api_key:
            raise ValueError("YDC_API_KEY environment variable is not set")

    def test_get_relevant_documents(self) -> None:
        """A simple query should come back with at least one document."""
        documents = YouRetriever().get_relevant_documents("test")

        assert len(documents) > 0
|
@ -0,0 +1,26 @@
|
||||
import json
|
||||
import os
|
||||
from unittest import mock
|
||||
|
||||
from requests import Response
|
||||
|
||||
from langchain.retrievers.you import YouRetriever
|
||||
from langchain.schema import Document
|
||||
|
||||
|
||||
class TestYouRetriever:
    """Unit test for ``YouRetriever`` with the HTTP layer mocked out."""

    def test_get_relevant_documents(self) -> None:
        """Each snippet of each hit becomes one ``Document``, in order."""
        fixture = {"hits": [{"snippets": ["yo"]}, {"snippets": ["bird up"]}]}
        response = Response()
        response.status_code = 200
        response._content = json.dumps(fixture).encode("utf-8")

        # patch.dict restores os.environ on exit, so the fake key cannot leak
        # into other tests (e.g. ones that check whether a real key is set).
        with mock.patch.dict(os.environ, {"YDC_API_KEY": "MOCK KEY!"}):
            with mock.patch("requests.get", return_value=response) as mock_get:
                retriever = YouRetriever()
                actual = retriever.get_relevant_documents("test")

        mock_get.assert_called_once()
        assert actual == [
            Document(page_content="yo"),
            Document(page_content="bird up"),
        ]
|
Loading…
Reference in New Issue