From 200be43da60001f62602ccf5d1de5f742c296af1 Mon Sep 17 00:00:00 2001 From: Leonid Ganeline Date: Sun, 2 Jul 2023 19:01:24 -0700 Subject: [PATCH] added `Brave Search` document_loader (#6989) - Added `Brave Search` document loader. - Refactored BraveSearch wrapper - Added a Jupyter Notebook example - Added `Ecosystem/Integrations` BraveSearch page Please review: - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev --- .../ecosystem/integrations/brave_search.mdx | 36 ++++ .../tools/integrations/brave_search.ipynb | 8 +- .../integrations/brave_search.ipynb | 164 ++++++++++++++++++ langchain/document_loaders/__init__.py | 2 + langchain/document_loaders/brave_search.py | 32 ++++ langchain/utilities/brave_search.py | 60 +++++-- 6 files changed, 282 insertions(+), 20 deletions(-) create mode 100644 docs/extras/ecosystem/integrations/brave_search.mdx create mode 100644 docs/extras/modules/data_connection/document_loaders/integrations/brave_search.ipynb create mode 100644 langchain/document_loaders/brave_search.py diff --git a/docs/extras/ecosystem/integrations/brave_search.mdx b/docs/extras/ecosystem/integrations/brave_search.mdx new file mode 100644 index 0000000000..2c22a42856 --- /dev/null +++ b/docs/extras/ecosystem/integrations/brave_search.mdx @@ -0,0 +1,36 @@ +# Brave Search + + +>[Brave Search](https://en.wikipedia.org/wiki/Brave_Search) is a search engine developed by Brave Software. +> - `Brave Search` uses its own web index. As of May 2022, it covered over 10 billion pages and was used to serve 92% +> of search results without relying on any third-parties, with the remainder being retrieved +> server-side from the Bing API or (on an opt-in basis) client-side from Google. According +> to Brave, the index was kept "intentionally smaller than that of Google or Bing" in order to +> help avoid spam and other low-quality content, with the disadvantage that "Brave Search is +> not yet as good as Google in recovering long-tail queries." 
+>- `Brave Search Premium`: As of April 2023 Brave Search is an ad-free website, but it will +> eventually switch to a new model that will include ads and premium users will get an ad-free experience. +> User data including IP addresses won't be collected from its users by default. A premium account +> will be required for opt-in data-collection. + + +## Installation and Setup + +To get access to the Brave Search API, you need to [create an account and get an API key](https://api.search.brave.com/app/dashboard). + + +## Document Loader + +See a [usage example](/docs/modules/data_connection/document_loaders/integrations/brave_search.html). + +```python +from langchain.document_loaders import BraveSearchLoader +``` + +## Tool + +See a [usage example](/docs/modules/agents/tools/integrations/brave_search.html). + +```python +from langchain.tools import BraveSearch +``` diff --git a/docs/extras/modules/agents/tools/integrations/brave_search.ipynb b/docs/extras/modules/agents/tools/integrations/brave_search.ipynb index 322282a915..73c5df525c 100644 --- a/docs/extras/modules/agents/tools/integrations/brave_search.ipynb +++ b/docs/extras/modules/agents/tools/integrations/brave_search.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "a4c896e5", "metadata": {}, "outputs": [], @@ -27,7 +27,7 @@ "metadata": {}, "outputs": [], "source": [ - "api_key = \"...\"" + "api_key = \"...\"" ] }, { @@ -49,7 +49,7 @@ { "data": { "text/plain": [ - "'[{\"title\": \"Barack Obama - Wikipedia\", \"link\": \"https://en.wikipedia.org/wiki/Barack_Obama\", \"snippet\": \"Outside of politics, Obama has published three bestselling books: Dreams from My Father (1995), The Audacity of Hope (2006) and A Promised Land (2020).
Rankings by scholars and historians, in which he has been featured since 2010, place him in the middle to upper tier of American presidents.\"}, {\"title\": \"Obama\\'s Middle Name -- My Last Name -- is \\'Hussein.\\' So?\", \"link\": \"https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/\", \"snippet\": \"Many Americans understand that common names don\\\\u2019t only come in the form of a \\\\u201cSmith\\\\u201d or a \\\\u201cJohnson.\\\\u201d Perhaps, they have a neighbor, mechanic or teacher named Hussein. Or maybe they\\\\u2019ve seen fashion designer Hussein Chalayan in the pages of Vogue or recall King Hussein, our ally in the Middle East.\"}, {\"title\": \"What\\'s up with Obama\\'s middle name? - Quora\", \"link\": \"https://www.quora.com/Whats-up-with-Obamas-middle-name\", \"snippet\": \"Answer (1 of 15): A better question would be, \\\\u201cWhat\\\\u2019s up with Obama\\\\u2019s first name?\\\\u201d President Barack Hussein Obama\\\\u2019s father\\\\u2019s name was Barack Hussein Obama. He was named after his father. Hussein, Obama\\\\u2019s middle name, is a very common Arabic name, meaning "good," "handsome," or "beautiful."\"}]'" + "'[{\"title\": \"Obama\\'s Middle Name -- My Last Name -- is \\'Hussein.\\' So?\", \"link\": \"https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/\", \"snippet\": \"I wasn\\\\u2019t sure whether to laugh or cry a few days back listening to radio talk show host Bill Cunningham repeatedly scream Barack Obama\\\\u2019s middle name \\\\u2014 my last name \\\\u2014 as if he had anti-Muslim Tourette\\\\u2019s. \\\\u201cHussein,\\\\u201d Cunningham hissed like he was beckoning Satan when shouting the ...\"}, {\"title\": \"What\\'s up with Obama\\'s middle name? 
- Quora\", \"link\": \"https://www.quora.com/Whats-up-with-Obamas-middle-name\", \"snippet\": \"Answer (1 of 15): A better question would be, \\\\u201cWhat\\\\u2019s up with Obama\\\\u2019s first name?\\\\u201d President Barack Hussein Obama\\\\u2019s father\\\\u2019s name was Barack Hussein Obama. He was named after his father. Hussein, Obama\\\\u2019s middle name, is a very common Arabic name, meaning "good," "handsome," or ...\"}, {\"title\": \"Barack Obama | Biography, Parents, Education, Presidency, Books, ...\", \"link\": \"https://www.britannica.com/biography/Barack-Obama\", \"snippet\": \"Barack Obama, in full Barack Hussein Obama II, (born August 4, 1961, Honolulu, Hawaii, U.S.), 44th president of the United States (2009\\\\u201317) and the first African American to hold the office. Before winning the presidency, Obama represented Illinois in the U.S.\"}]'" ] }, "execution_count": 4, @@ -86,7 +86,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.1" + "version": "3.10.6" } }, "nbformat": 4, diff --git a/docs/extras/modules/data_connection/document_loaders/integrations/brave_search.ipynb b/docs/extras/modules/data_connection/document_loaders/integrations/brave_search.ipynb new file mode 100644 index 0000000000..db3106367a --- /dev/null +++ b/docs/extras/modules/data_connection/document_loaders/integrations/brave_search.ipynb @@ -0,0 +1,164 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "3dd292b1-9a73-4ea8-af19-5fa6e3c1a62a", + "metadata": {}, + "source": [ + "# Brave Search\n", + "\n", + "\n", + ">[Brave Search](https://en.wikipedia.org/wiki/Brave_Search) is a search engine developed by Brave Software.\n", + "> - `Brave Search` uses its own web index. 
As of May 2022, it covered over 10 billion pages and was used to serve 92% \n", + "> of search results without relying on any third-parties, with the remainder being retrieved \n", + "> server-side from the Bing API or (on an opt-in basis) client-side from Google. According \n", + "> to Brave, the index was kept \"intentionally smaller than that of Google or Bing\" in order to \n", + "> help avoid spam and other low-quality content, with the disadvantage that \"Brave Search is \n", + "> not yet as good as Google in recovering long-tail queries.\"\n", + ">- `Brave Search Premium`: As of April 2023 Brave Search is an ad-free website, but it will \n", + "> eventually switch to a new model that will include ads and premium users will get an ad-free experience.\n", + "> User data including IP addresses won't be collected from its users by default. A premium account \n", + "> will be required for opt-in data-collection.\n" + ] + }, + { + "cell_type": "markdown", + "id": "26f0888e-3f3e-4b82-ac4a-2df6feeccbe0", + "metadata": {}, + "source": [ + "## Installation and Setup\n", + "\n", + "To get access to the Brave Search API, you need to [create an account and get an API key](https://api.search.brave.com/app/dashboard).\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d7d7be09-58bd-47d7-bf1b-33964564f777", + "metadata": {}, + "outputs": [], + "source": [ + "api_key = \"...\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3ac92df-6ff0-4dbb-b32b-a7dc140c48ef", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import BraveSearchLoader" + ] + }, + { + "cell_type": "markdown", + "id": "7f483caf-58ef-4138-975a-5b783559dc1b", + "metadata": {}, + "source": [ + "## Example" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "766634cf-3bc7-4656-939a-cafa218807a6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3" + ] + }, + "execution_count": 6, + "metadata": 
{}, + "output_type": "execute_result" + } + ], + "source": [ + "loader = BraveSearchLoader(query=\"obama middle name\", api_key=api_key, search_kwargs={\"count\": 3})\n", + "docs = loader.load()\n", + "len(docs)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "f1fcc9f1-cbdc-46b3-89d3-80311d557dc6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'title': \"Obama's Middle Name -- My Last Name -- is 'Hussein.' So?\",\n", + " 'link': 'https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/'},\n", + " {'title': \"What's up with Obama's middle name? - Quora\",\n", + " 'link': 'https://www.quora.com/Whats-up-with-Obamas-middle-name'},\n", + " {'title': 'Barack Obama | Biography, Parents, Education, Presidency, Books, ...',\n", + " 'link': 'https://www.britannica.com/biography/Barack-Obama'}]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[doc.metadata for doc in docs]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "601bfd77-03d3-468e-843f-2523d5e215bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['I wasn’t sure whether to laugh or cry a few days back listening to radio talk show host Bill Cunningham repeatedly scream Barack Obamas middle name — my last name — as if he had anti-Muslim Tourette’s. “Hussein,” Cunningham hissed like he was beckoning Satan when shouting the ...',\n", + " 'Answer (1 of 15): A better question would be, “What’s up with Obama’s first name?” President Barack Hussein Obama’s father’s name was Barack Hussein Obama. He was named after his father. Hussein, Obamas middle name, is a very common Arabic name, meaning "good," "handsome," or ...',\n", + " 'Barack Obama, in full Barack Hussein Obama II, (born August 4, 1961, Honolulu, Hawaii, U.S.), 44th president of the United States (2009–17) and the first African American to hold the office. 
Before winning the presidency, Obama represented Illinois in the U.S.']" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "[doc.page_content for doc in docs]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74a6ba54-9e48-4bac-ab9b-03eabd19eb81", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/langchain/document_loaders/__init__.py b/langchain/document_loaders/__init__.py index 8fcecd3abf..9425767782 100644 --- a/langchain/document_loaders/__init__.py +++ b/langchain/document_loaders/__init__.py @@ -23,6 +23,7 @@ from langchain.document_loaders.blob_loaders import ( YoutubeAudioLoader, ) from langchain.document_loaders.blockchain import BlockchainDocumentLoader +from langchain.document_loaders.brave_search import BraveSearchLoader from langchain.document_loaders.chatgpt import ChatGPTLoader from langchain.document_loaders.college_confidential import CollegeConfidentialLoader from langchain.document_loaders.confluence import ConfluenceLoader @@ -168,6 +169,7 @@ __all__ = [ "Blob", "BlobLoader", "BlockchainDocumentLoader", + "BraveSearchLoader", "CSVLoader", "ChatGPTLoader", "CoNLLULoader", diff --git a/langchain/document_loaders/brave_search.py b/langchain/document_loaders/brave_search.py new file mode 100644 index 0000000000..2887256e1e --- /dev/null +++ b/langchain/document_loaders/brave_search.py @@ -0,0 +1,32 @@ +from typing import Iterator, List, Optional + +from langchain.docstore.document import Document +from 
langchain.document_loaders.base import BaseLoader +from langchain.utilities.brave_search import BraveSearchWrapper + + +class BraveSearchLoader(BaseLoader): + """Loads a query result from Brave Search engine into a list of Documents.""" + + def __init__(self, query: str, api_key: str, search_kwargs: Optional[dict] = None): + """Initializes the BraveSearchLoader. + + Args: + query: The query to search for. + api_key: The API key to use. + search_kwargs: The search kwargs to use. + """ + self.query = query + self.api_key = api_key + self.search_kwargs = search_kwargs or {} + + def load(self) -> List[Document]: + brave_client = BraveSearchWrapper( + api_key=self.api_key, + search_kwargs=self.search_kwargs, + ) + return brave_client.download_documents(self.query) + + def lazy_load(self) -> Iterator[Document]: + for doc in self.load(): + yield doc diff --git a/langchain/utilities/brave_search.py b/langchain/utilities/brave_search.py index 8210e4082e..30d5994c70 100644 --- a/langchain/utilities/brave_search.py +++ b/langchain/utilities/brave_search.py @@ -1,40 +1,68 @@ import json +from typing import List import requests from pydantic import BaseModel, Field +from langchain.schema import Document + class BraveSearchWrapper(BaseModel): api_key: str search_kwargs: dict = Field(default_factory=dict) + base_url = "https://api.search.brave.com/res/v1/web/search" def run(self, query: str) -> str: + """Query the Brave search engine and return the results as a JSON string. + + Args: + query: The query to search for. + + Returns: The results as a JSON string. + + """ + web_search_results = self._search_request(query=query) + final_results = [ + { + "title": item.get("title"), + "link": item.get("url"), + "snippet": item.get("description"), + } + for item in web_search_results + ] + return json.dumps(final_results) + + def download_documents(self, query: str) -> List[Document]: + """Query the Brave search engine and return the results as a list of Documents.
+ + Args: + query: The query to search for. + + Returns: The results as a list of Documents. + + """ + results = self._search_request(query) + return [ + Document( + page_content=item.get("description"), + metadata={"title": item.get("title"), "link": item.get("url")}, + ) + for item in results + ] + + def _search_request(self, query: str) -> List[dict]: headers = { "X-Subscription-Token": self.api_key, "Accept": "application/json", } - base_url = "https://api.search.brave.com/res/v1/web/search" req = requests.PreparedRequest() params = {**self.search_kwargs, **{"q": query}} - req.prepare_url(base_url, params) + req.prepare_url(self.base_url, params) if req.url is None: raise ValueError("prepared url is None, this should not happen") response = requests.get(req.url, headers=headers) - if not response.ok: raise Exception(f"HTTP error {response.status_code}") - parsed_response = response.json() - web_search_results = parsed_response.get("web", {}).get("results", []) - final_results = [] - if isinstance(web_search_results, list): - for item in web_search_results: - final_results.append( - { - "title": item.get("title"), - "link": item.get("url"), - "snippet": item.get("description"), - } - ) - return json.dumps(final_results) + return response.json().get("web", {}).get("results", [])