From 9903a70379216ac166ea5ae583353894fe534bc6 Mon Sep 17 00:00:00 2001
From: mrbean <43734688+sam-h-bean@users.noreply.github.com>
Date: Thu, 5 Oct 2023 16:48:11 -0400
Subject: [PATCH] Add youdotcom retriever (#11304)

---------

Co-authored-by: Bagatur
---
 .../retrievers/you-retriever.ipynb           | 62 +++++++++++++++++++
 libs/langchain/langchain/retrievers/you.py   | 59 ++++++++++++++++++
 .../integration_tests/retrievers/test_you.py | 16 +++++
 .../tests/unit_tests/retrievers/test_you.py  | 29 +++++++++
 4 files changed, 166 insertions(+)
 create mode 100644 docs/extras/integrations/retrievers/you-retriever.ipynb
 create mode 100644 libs/langchain/langchain/retrievers/you.py
 create mode 100644 libs/langchain/tests/integration_tests/retrievers/test_you.py
 create mode 100644 libs/langchain/tests/unit_tests/retrievers/test_you.py

diff --git a/docs/extras/integrations/retrievers/you-retriever.ipynb b/docs/extras/integrations/retrievers/you-retriever.ipynb
new file mode 100644
index 0000000000..208ae70adb
--- /dev/null
+++ b/docs/extras/integrations/retrievers/you-retriever.ipynb
@@ -0,0 +1,62 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "47828a7a",
+   "metadata": {},
+   "source": [
+    "## Using the You.com Retriever\n",
+    "The retriever from You.com is good for retrieving lots of text. We return multiple of the best text snippets per URL we find to be relevant.\n",
+    "\n",
+    "First you just need to initialize the retriever"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a90d61d4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.retrievers.you import YouRetriever\n",
+    "from langchain.chains import RetrievalQA\n",
+    "from langchain.llms import OpenAI\n",
+    "\n",
+    "yr = YouRetriever()\n",
+    "qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type=\"map_reduce\", retriever=yr)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4a223f2f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"what starting ohio state quarterback most recently went their entire college career without beating Michigan?\"\n",
+    "qa.run(query)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.17"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/langchain/langchain/retrievers/you.py b/libs/langchain/langchain/retrievers/you.py
new file mode 100644
index 0000000000..3c281c3de2
--- /dev/null
+++ b/libs/langchain/langchain/retrievers/you.py
@@ -0,0 +1,59 @@
+from typing import Any, Dict, List
+
+from langchain.callbacks.manager import CallbackManagerForRetrieverRun
+from langchain.pydantic_v1 import root_validator
+from langchain.schema import BaseRetriever, Document
+from langchain.utils import get_from_dict_or_env
+
+
+class YouRetriever(BaseRetriever):
+    """`You` retriever that uses You.com's search API.
+
+    To connect to the You.com api requires an API key which
+    you can get by emailing api@you.com.
+    You can check out our docs at https://documentation.you.com.
+
+    You need to set the environment variable `YDC_API_KEY` for retriever to operate.
+    """
+
+    ydc_api_key: str
+
+    @root_validator(pre=True)
+    def validate_client(
+        cls,
+        values: Dict[str, Any],
+    ) -> Dict[str, Any]:
+        """Fill `ydc_api_key` from the constructor kwargs or `YDC_API_KEY`."""
+        values["ydc_api_key"] = get_from_dict_or_env(
+            values, "ydc_api_key", "YDC_API_KEY"
+        )
+        return values
+
+    def _get_relevant_documents(
+        self, query: str, *, run_manager: CallbackManagerForRetrieverRun
+    ) -> List[Document]:
+        """Search You.com for `query` and return one Document per snippet.
+
+        Raises:
+            requests.HTTPError: If the API responds with an error status.
+        """
+        import requests
+
+        # Let `requests` URL-encode the query; interpolating it into the URL
+        # corrupts queries containing characters such as `&`, `#` or `+`.
+        # The timeout keeps a stalled connection from hanging the caller forever.
+        response = requests.get(
+            "https://api.ydc-index.io/search",
+            params={"query": query},
+            headers={"X-API-Key": self.ydc_api_key},
+            timeout=30,
+        )
+        # Fail loudly on 4xx/5xx instead of a KeyError on the error payload.
+        response.raise_for_status()
+        results = response.json()
+
+        return [
+            Document(page_content=snippet)
+            for hit in results["hits"]
+            for snippet in hit["snippets"]
+        ]
diff --git a/libs/langchain/tests/integration_tests/retrievers/test_you.py b/libs/langchain/tests/integration_tests/retrievers/test_you.py
new file mode 100644
index 0000000000..6d1741adea
--- /dev/null
+++ b/libs/langchain/tests/integration_tests/retrievers/test_you.py
@@ -0,0 +1,16 @@
+import os
+
+from langchain.retrievers.you import YouRetriever
+
+
+class TestYouRetriever:
+    @classmethod
+    def setup_class(cls) -> None:
+        if not os.getenv("YDC_API_KEY"):
+            raise ValueError("YDC_API_KEY environment variable is not set")
+
+    def test_get_relevant_documents(self) -> None:
+        retriever = YouRetriever()
+        actual = retriever.get_relevant_documents("test")
+
+        assert len(actual) > 0
diff --git a/libs/langchain/tests/unit_tests/retrievers/test_you.py b/libs/langchain/tests/unit_tests/retrievers/test_you.py
new file mode 100644
index 0000000000..54ab1cb8e2
--- /dev/null
+++ b/libs/langchain/tests/unit_tests/retrievers/test_you.py
@@ -0,0 +1,29 @@
+import json
+import os
+from unittest import mock
+
+from requests import Response
+
+from langchain.retrievers.you import YouRetriever
+from langchain.schema import Document
+
+
+class TestYouRetriever:
+    def test_get_relevant_documents(self) -> None:
+        fixture = {"hits": [{"snippets": ["yo"]}, {"snippets": ["bird up"]}]}
+        response = Response()
+        response.status_code = 200
+        response._content = json.dumps(fixture).encode("utf-8")
+
+        # patch.dict restores the environment on exit, so the fake key
+        # cannot leak into other tests.
+        with mock.patch.dict(os.environ, {"YDC_API_KEY": "MOCK KEY!"}):
+            with mock.patch("requests.get", return_value=response) as mock_get:
+                retriever = YouRetriever()
+                actual = retriever.get_relevant_documents("a & b")
+
+        assert actual == [
+            Document(page_content="yo"),
+            Document(page_content="bird up"),
+        ]
+        assert mock_get.call_args.kwargs["params"] == {"query": "a & b"}