mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
added Brave Search
document_loader (#6989)
- Added `Brave Search` document loader. - Refactored BraveSearch wrapper - Added a Jupyter Notebook example - Added `Ecosystem/Integrations` BraveSearch page Please review: - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
This commit is contained in:
parent
6d15854cda
commit
200be43da6
36
docs/extras/ecosystem/integrations/brave_search.mdx
Normal file
36
docs/extras/ecosystem/integrations/brave_search.mdx
Normal file
@ -0,0 +1,36 @@
|
||||
# Brave Search
|
||||
|
||||
|
||||
>[Brave Search](https://en.wikipedia.org/wiki/Brave_Search) is a search engine developed by Brave Software.
|
||||
> - `Brave Search` uses its own web index. As of May 2022, it covered over 10 billion pages and was used to serve 92%
|
||||
> of search results without relying on any third-parties, with the remainder being retrieved
|
||||
> server-side from the Bing API or (on an opt-in basis) client-side from Google. According
|
||||
> to Brave, the index was kept "intentionally smaller than that of Google or Bing" in order to
|
||||
> help avoid spam and other low-quality content, with the disadvantage that "Brave Search is
|
||||
> not yet as good as Google in recovering long-tail queries."
|
||||
>- `Brave Search Premium`: As of April 2023 Brave Search is an ad-free website, but it will
|
||||
> eventually switch to a new model that will include ads and premium users will get an ad-free experience.
|
||||
> User data including IP addresses won't be collected from its users by default. A premium account
|
||||
> will be required for opt-in data-collection.
|
||||
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
To get access to the Brave Search API, you need to [create an account and get an API key](https://api.search.brave.com/app/dashboard).
|
||||
|
||||
|
||||
## Document Loader
|
||||
|
||||
See a [usage example](/docs/modules/data_connection/document_loaders/integrations/brave_search.html).
|
||||
|
||||
```python
|
||||
from langchain.document_loaders import BraveSearchLoader
|
||||
```
|
||||
|
||||
## Tool
|
||||
|
||||
See a [usage example](/docs/modules/agents/tools/integrations/brave_search.html).
|
||||
|
||||
```python
|
||||
from langchain.tools import BraveSearch
|
||||
```
|
@ -12,7 +12,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"execution_count": null,
|
||||
"id": "a4c896e5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
@ -27,7 +27,7 @@
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"api_key = \"...\""
|
||||
"api_key = \"BSAv1neIuQOsxqOyy0sEe_ie2zD_n_V\""
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -49,7 +49,7 @@
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'[{\"title\": \"Barack Obama - Wikipedia\", \"link\": \"https://en.wikipedia.org/wiki/Barack_Obama\", \"snippet\": \"Outside of politics, <strong>Obama</strong> has published three bestselling books: Dreams from My Father (1995), The Audacity of Hope (2006) and A Promised Land (2020). Rankings by scholars and historians, in which he has been featured since 2010, place him in the <strong>middle</strong> to upper tier of American presidents.\"}, {\"title\": \"Obama\\'s Middle Name -- My Last Name -- is \\'Hussein.\\' So?\", \"link\": \"https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/\", \"snippet\": \"Many Americans understand that common names don\\\\u2019t only come in the form of a \\\\u201cSmith\\\\u201d or a \\\\u201cJohnson.\\\\u201d Perhaps, they have a neighbor, mechanic or teacher named Hussein. Or maybe they\\\\u2019ve seen fashion designer Hussein Chalayan in the pages of Vogue or recall <strong>King Hussein</strong>, our ally in the Middle East.\"}, {\"title\": \"What\\'s up with Obama\\'s middle name? - Quora\", \"link\": \"https://www.quora.com/Whats-up-with-Obamas-middle-name\", \"snippet\": \"Answer (1 of 15): A better question would be, \\\\u201cWhat\\\\u2019s up with Obama\\\\u2019s first name?\\\\u201d President <strong>Barack Hussein Obama</strong>\\\\u2019s father\\\\u2019s name was <strong>Barack Hussein Obama</strong>. He was named after his father. Hussein, Obama\\\\u2019s middle name, is a very common Arabic name, meaning "good," "handsome," or "beautiful."\"}]'"
|
||||
"'[{\"title\": \"Obama\\'s Middle Name -- My Last Name -- is \\'Hussein.\\' So?\", \"link\": \"https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/\", \"snippet\": \"I wasn\\\\u2019t sure whether to laugh or cry a few days back listening to radio talk show host Bill Cunningham repeatedly scream Barack <strong>Obama</strong>\\\\u2019<strong>s</strong> <strong>middle</strong> <strong>name</strong> \\\\u2014 my last <strong>name</strong> \\\\u2014 as if he had anti-Muslim Tourette\\\\u2019s. \\\\u201cHussein,\\\\u201d Cunningham hissed like he was beckoning Satan when shouting the ...\"}, {\"title\": \"What\\'s up with Obama\\'s middle name? - Quora\", \"link\": \"https://www.quora.com/Whats-up-with-Obamas-middle-name\", \"snippet\": \"Answer (1 of 15): A better question would be, \\\\u201cWhat\\\\u2019s up with <strong>Obama</strong>\\\\u2019s first <strong>name</strong>?\\\\u201d President Barack Hussein <strong>Obama</strong>\\\\u2019s father\\\\u2019s <strong>name</strong> was Barack Hussein <strong>Obama</strong>. He was <strong>named</strong> after his father. Hussein, <strong>Obama</strong>\\\\u2019<strong>s</strong> <strong>middle</strong> <strong>name</strong>, is a very common Arabic <strong>name</strong>, meaning "good," "handsome," or ...\"}, {\"title\": \"Barack Obama | Biography, Parents, Education, Presidency, Books, ...\", \"link\": \"https://www.britannica.com/biography/Barack-Obama\", \"snippet\": \"Barack <strong>Obama</strong>, in full Barack Hussein <strong>Obama</strong> II, (born August 4, 1961, Honolulu, Hawaii, U.S.), 44th president of the United States (2009\\\\u201317) and the first African American to hold the office. Before winning the presidency, <strong>Obama</strong> represented Illinois in the U.S.\"}]'"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
@ -86,7 +86,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.1"
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -0,0 +1,164 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3dd292b1-9a73-4ea8-af19-5fa6e3c1a62a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Brave Search\n",
|
||||
"\n",
|
||||
"\n",
|
||||
">[Brave Search](https://en.wikipedia.org/wiki/Brave_Search) is a search engine developed by Brave Software.\n",
|
||||
"> - `Brave Search` uses its own web index. As of May 2022, it covered over 10 billion pages and was used to serve 92% \n",
|
||||
"> of search results without relying on any third-parties, with the remainder being retrieved \n",
|
||||
"> server-side from the Bing API or (on an opt-in basis) client-side from Google. According \n",
|
||||
"> to Brave, the index was kept \"intentionally smaller than that of Google or Bing\" in order to \n",
|
||||
"> help avoid spam and other low-quality content, with the disadvantage that \"Brave Search is \n",
|
||||
"> not yet as good as Google in recovering long-tail queries.\"\n",
|
||||
">- `Brave Search Premium`: As of April 2023 Brave Search is an ad-free website, but it will \n",
|
||||
"> eventually switch to a new model that will include ads and premium users will get an ad-free experience.\n",
|
||||
"> User data including IP addresses won't be collected from its users by default. A premium account \n",
|
||||
"> will be required for opt-in data-collection.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "26f0888e-3f3e-4b82-ac4a-2df6feeccbe0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Installation and Setup\n",
|
||||
"\n",
|
||||
"To get access to the Brave Search API, you need to [create an account and get an API key](https://api.search.brave.com/app/dashboard).\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "d7d7be09-58bd-47d7-bf1b-33964564f777",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"api_key = \"...\""
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "b3ac92df-6ff0-4dbb-b32b-a7dc140c48ef",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from langchain.document_loaders import BraveSearchLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7f483caf-58ef-4138-975a-5b783559dc1b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Example"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "766634cf-3bc7-4656-939a-cafa218807a6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"3"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"loader = BraveSearchLoader(query=\"obama middle name\", api_key=api_key, search_kwargs={\"count\": 3})\n",
|
||||
"docs = loader.load()\n",
|
||||
"len(docs)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f1fcc9f1-cbdc-46b3-89d3-80311d557dc6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'title': \"Obama's Middle Name -- My Last Name -- is 'Hussein.' So?\",\n",
|
||||
" 'link': 'https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/'},\n",
|
||||
" {'title': \"What's up with Obama's middle name? - Quora\",\n",
|
||||
" 'link': 'https://www.quora.com/Whats-up-with-Obamas-middle-name'},\n",
|
||||
" {'title': 'Barack Obama | Biography, Parents, Education, Presidency, Books, ...',\n",
|
||||
" 'link': 'https://www.britannica.com/biography/Barack-Obama'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[doc.metadata for doc in docs]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "601bfd77-03d3-468e-843f-2523d5e215bd",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"['I wasn’t sure whether to laugh or cry a few days back listening to radio talk show host Bill Cunningham repeatedly scream Barack <strong>Obama</strong>’<strong>s</strong> <strong>middle</strong> <strong>name</strong> — my last <strong>name</strong> — as if he had anti-Muslim Tourette’s. “Hussein,” Cunningham hissed like he was beckoning Satan when shouting the ...',\n",
|
||||
" 'Answer (1 of 15): A better question would be, “What’s up with <strong>Obama</strong>’s first <strong>name</strong>?” President Barack Hussein <strong>Obama</strong>’s father’s <strong>name</strong> was Barack Hussein <strong>Obama</strong>. He was <strong>named</strong> after his father. Hussein, <strong>Obama</strong>’<strong>s</strong> <strong>middle</strong> <strong>name</strong>, is a very common Arabic <strong>name</strong>, meaning "good," "handsome," or ...',\n",
|
||||
" 'Barack <strong>Obama</strong>, in full Barack Hussein <strong>Obama</strong> II, (born August 4, 1961, Honolulu, Hawaii, U.S.), 44th president of the United States (2009–17) and the first African American to hold the office. Before winning the presidency, <strong>Obama</strong> represented Illinois in the U.S.']"
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"[doc.page_content for doc in docs]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "74a6ba54-9e48-4bac-ab9b-03eabd19eb81",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.6"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -23,6 +23,7 @@ from langchain.document_loaders.blob_loaders import (
|
||||
YoutubeAudioLoader,
|
||||
)
|
||||
from langchain.document_loaders.blockchain import BlockchainDocumentLoader
|
||||
from langchain.document_loaders.brave_search import BraveSearchLoader
|
||||
from langchain.document_loaders.chatgpt import ChatGPTLoader
|
||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||
from langchain.document_loaders.confluence import ConfluenceLoader
|
||||
@ -168,6 +169,7 @@ __all__ = [
|
||||
"Blob",
|
||||
"BlobLoader",
|
||||
"BlockchainDocumentLoader",
|
||||
"BraveSearchLoader",
|
||||
"CSVLoader",
|
||||
"ChatGPTLoader",
|
||||
"CoNLLULoader",
|
||||
|
32
langchain/document_loaders/brave_search.py
Normal file
32
langchain/document_loaders/brave_search.py
Normal file
@ -0,0 +1,32 @@
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.utilities.brave_search import BraveSearchWrapper
|
||||
|
||||
|
||||
class BraveSearchLoader(BaseLoader):
|
||||
"""Loads a query result from Brave Search engine into a list of Documents."""
|
||||
|
||||
def __init__(self, query: str, api_key: str, search_kwargs: Optional[dict] = None):
|
||||
"""Initializes the BraveLoader.
|
||||
|
||||
Args:
|
||||
query: The query to search for.
|
||||
api_key: The API key to use.
|
||||
search_kwargs: The search kwargs to use.
|
||||
"""
|
||||
self.query = query
|
||||
self.api_key = api_key
|
||||
self.search_kwargs = search_kwargs or {}
|
||||
|
||||
def load(self) -> List[Document]:
|
||||
brave_client = BraveSearchWrapper(
|
||||
api_key=self.api_key,
|
||||
search_kwargs=self.search_kwargs,
|
||||
)
|
||||
return brave_client.download_documents(self.query)
|
||||
|
||||
def lazy_load(self) -> Iterator[Document]:
|
||||
for doc in self.load():
|
||||
yield doc
|
@ -1,40 +1,68 @@
|
||||
import json
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from langchain.schema import Document
|
||||
|
||||
|
||||
class BraveSearchWrapper(BaseModel):
|
||||
api_key: str
|
||||
search_kwargs: dict = Field(default_factory=dict)
|
||||
base_url = "https://api.search.brave.com/res/v1/web/search"
|
||||
|
||||
def run(self, query: str) -> str:
|
||||
headers = {
|
||||
"X-Subscription-Token": self.api_key,
|
||||
"Accept": "application/json",
|
||||
}
|
||||
base_url = "https://api.search.brave.com/res/v1/web/search"
|
||||
req = requests.PreparedRequest()
|
||||
params = {**self.search_kwargs, **{"q": query}}
|
||||
req.prepare_url(base_url, params)
|
||||
if req.url is None:
|
||||
raise ValueError("prepared url is None, this should not happen")
|
||||
"""Query the Brave search engine and return the results as a JSON string.
|
||||
|
||||
response = requests.get(req.url, headers=headers)
|
||||
Args:
|
||||
query: The query to search for.
|
||||
|
||||
if not response.ok:
|
||||
raise Exception(f"HTTP error {response.status_code}")
|
||||
Returns: The results as a JSON string.
|
||||
|
||||
parsed_response = response.json()
|
||||
web_search_results = parsed_response.get("web", {}).get("results", [])
|
||||
final_results = []
|
||||
if isinstance(web_search_results, list):
|
||||
for item in web_search_results:
|
||||
final_results.append(
|
||||
"""
|
||||
web_search_results = self._search_request(query=query)
|
||||
final_results = [
|
||||
{
|
||||
"title": item.get("title"),
|
||||
"link": item.get("url"),
|
||||
"snippet": item.get("description"),
|
||||
}
|
||||
)
|
||||
for item in web_search_results
|
||||
]
|
||||
return json.dumps(final_results)
|
||||
|
||||
def download_documents(self, query: str) -> List[Document]:
|
||||
"""Query the Brave search engine and return the results as a list of Documents.
|
||||
|
||||
Args:
|
||||
query: The query to search for.
|
||||
|
||||
Returns: The results as a list of Documents.
|
||||
|
||||
"""
|
||||
results = self._search_request(query)
|
||||
return [
|
||||
Document(
|
||||
page_content=item.get("description"),
|
||||
metadata={"title": item.get("title"), "link": item.get("url")},
|
||||
)
|
||||
for item in results
|
||||
]
|
||||
|
||||
def _search_request(self, query: str) -> List[dict]:
|
||||
headers = {
|
||||
"X-Subscription-Token": self.api_key,
|
||||
"Accept": "application/json",
|
||||
}
|
||||
req = requests.PreparedRequest()
|
||||
params = {**self.search_kwargs, **{"q": query}}
|
||||
req.prepare_url(self.base_url, params)
|
||||
if req.url is None:
|
||||
raise ValueError("prepared url is None, this should not happen")
|
||||
|
||||
response = requests.get(req.url, headers=headers)
|
||||
if not response.ok:
|
||||
raise Exception(f"HTTP error {response.status_code}")
|
||||
|
||||
return response.json().get("web", {}).get("results", [])
|
||||
|
Loading…
Reference in New Issue
Block a user