mirror of
https://github.com/hwchase17/langchain
synced 2024-11-06 03:20:49 +00:00
added Brave Search
document_loader (#6989)
- Added `Brave Search` document loader. - Refactored BraveSearch wrapper - Added a Jupyter Notebook example - Added `Ecosystem/Integrations` BraveSearch page Please review: - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsev
This commit is contained in:
parent
6d15854cda
commit
200be43da6
36
docs/extras/ecosystem/integrations/brave_search.mdx
Normal file
36
docs/extras/ecosystem/integrations/brave_search.mdx
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
# Brave Search
|
||||||
|
|
||||||
|
|
||||||
|
>[Brave Search](https://en.wikipedia.org/wiki/Brave_Search) is a search engine developed by Brave Software.
|
||||||
|
> - `Brave Search` uses its own web index. As of May 2022, it covered over 10 billion pages and was used to serve 92%
|
||||||
|
> of search results without relying on any third parties, with the remainder being retrieved
|
||||||
|
> server-side from the Bing API or (on an opt-in basis) client-side from Google. According
|
||||||
|
> to Brave, the index was kept "intentionally smaller than that of Google or Bing" in order to
|
||||||
|
> help avoid spam and other low-quality content, with the disadvantage that "Brave Search is
|
||||||
|
> not yet as good as Google in recovering long-tail queries."
|
||||||
|
> - `Brave Search Premium`: As of April 2023 Brave Search is an ad-free website, but it will
|
||||||
|
> eventually switch to a new model that will include ads and premium users will get an ad-free experience.
|
||||||
|
> User data including IP addresses won't be collected from its users by default. A premium account
|
||||||
|
> will be required for opt-in data-collection.
|
||||||
|
|
||||||
|
|
||||||
|
## Installation and Setup
|
||||||
|
|
||||||
|
To get access to the Brave Search API, you need to [create an account and get an API key](https://api.search.brave.com/app/dashboard).
|
||||||
|
|
||||||
|
|
||||||
|
## Document Loader
|
||||||
|
|
||||||
|
See a [usage example](/docs/modules/data_connection/document_loaders/integrations/brave_search.html).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.document_loaders import BraveSearchLoader
|
||||||
|
```
|
||||||
|
|
||||||
|
## Tool
|
||||||
|
|
||||||
|
See a [usage example](/docs/modules/agents/tools/integrations/brave_search.html).
|
||||||
|
|
||||||
|
```python
|
||||||
|
from langchain.tools import BraveSearch
|
||||||
|
```
|
@ -12,7 +12,7 @@
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
"cell_type": "code",
|
"cell_type": "code",
|
||||||
"execution_count": 1,
|
"execution_count": null,
|
||||||
"id": "a4c896e5",
|
"id": "a4c896e5",
|
||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
@ -27,7 +27,7 @@
|
|||||||
"metadata": {},
|
"metadata": {},
|
||||||
"outputs": [],
|
"outputs": [],
|
||||||
"source": [
|
"source": [
|
||||||
"api_key = \"...\""
|
"api_key = \"...\""
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
@ -49,7 +49,7 @@
|
|||||||
{
|
{
|
||||||
"data": {
|
"data": {
|
||||||
"text/plain": [
|
"text/plain": [
|
||||||
"'[{\"title\": \"Barack Obama - Wikipedia\", \"link\": \"https://en.wikipedia.org/wiki/Barack_Obama\", \"snippet\": \"Outside of politics, <strong>Obama</strong> has published three bestselling books: Dreams from My Father (1995), The Audacity of Hope (2006) and A Promised Land (2020). Rankings by scholars and historians, in which he has been featured since 2010, place him in the <strong>middle</strong> to upper tier of American presidents.\"}, {\"title\": \"Obama\\'s Middle Name -- My Last Name -- is \\'Hussein.\\' So?\", \"link\": \"https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/\", \"snippet\": \"Many Americans understand that common names don\\\\u2019t only come in the form of a \\\\u201cSmith\\\\u201d or a \\\\u201cJohnson.\\\\u201d Perhaps, they have a neighbor, mechanic or teacher named Hussein. Or maybe they\\\\u2019ve seen fashion designer Hussein Chalayan in the pages of Vogue or recall <strong>King Hussein</strong>, our ally in the Middle East.\"}, {\"title\": \"What\\'s up with Obama\\'s middle name? - Quora\", \"link\": \"https://www.quora.com/Whats-up-with-Obamas-middle-name\", \"snippet\": \"Answer (1 of 15): A better question would be, \\\\u201cWhat\\\\u2019s up with Obama\\\\u2019s first name?\\\\u201d President <strong>Barack Hussein Obama</strong>\\\\u2019s father\\\\u2019s name was <strong>Barack Hussein Obama</strong>. He was named after his father. Hussein, Obama\\\\u2019s middle name, is a very common Arabic name, meaning "good," "handsome," or "beautiful."\"}]'"
|
"'[{\"title\": \"Obama\\'s Middle Name -- My Last Name -- is \\'Hussein.\\' So?\", \"link\": \"https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/\", \"snippet\": \"I wasn\\\\u2019t sure whether to laugh or cry a few days back listening to radio talk show host Bill Cunningham repeatedly scream Barack <strong>Obama</strong>\\\\u2019<strong>s</strong> <strong>middle</strong> <strong>name</strong> \\\\u2014 my last <strong>name</strong> \\\\u2014 as if he had anti-Muslim Tourette\\\\u2019s. \\\\u201cHussein,\\\\u201d Cunningham hissed like he was beckoning Satan when shouting the ...\"}, {\"title\": \"What\\'s up with Obama\\'s middle name? - Quora\", \"link\": \"https://www.quora.com/Whats-up-with-Obamas-middle-name\", \"snippet\": \"Answer (1 of 15): A better question would be, \\\\u201cWhat\\\\u2019s up with <strong>Obama</strong>\\\\u2019s first <strong>name</strong>?\\\\u201d President Barack Hussein <strong>Obama</strong>\\\\u2019s father\\\\u2019s <strong>name</strong> was Barack Hussein <strong>Obama</strong>. He was <strong>named</strong> after his father. Hussein, <strong>Obama</strong>\\\\u2019<strong>s</strong> <strong>middle</strong> <strong>name</strong>, is a very common Arabic <strong>name</strong>, meaning "good," "handsome," or ...\"}, {\"title\": \"Barack Obama | Biography, Parents, Education, Presidency, Books, ...\", \"link\": \"https://www.britannica.com/biography/Barack-Obama\", \"snippet\": \"Barack <strong>Obama</strong>, in full Barack Hussein <strong>Obama</strong> II, (born August 4, 1961, Honolulu, Hawaii, U.S.), 44th president of the United States (2009\\\\u201317) and the first African American to hold the office. Before winning the presidency, <strong>Obama</strong> represented Illinois in the U.S.\"}]'"
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
"execution_count": 4,
|
"execution_count": 4,
|
||||||
@ -86,7 +86,7 @@
|
|||||||
"name": "python",
|
"name": "python",
|
||||||
"nbconvert_exporter": "python",
|
"nbconvert_exporter": "python",
|
||||||
"pygments_lexer": "ipython3",
|
"pygments_lexer": "ipython3",
|
||||||
"version": "3.9.1"
|
"version": "3.10.6"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"nbformat": 4,
|
"nbformat": 4,
|
||||||
|
@ -0,0 +1,164 @@
|
|||||||
|
{
|
||||||
|
"cells": [
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "3dd292b1-9a73-4ea8-af19-5fa6e3c1a62a",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"# Brave Search\n",
|
||||||
|
"\n",
|
||||||
|
"\n",
|
||||||
|
">[Brave Search](https://en.wikipedia.org/wiki/Brave_Search) is a search engine developed by Brave Software.\n",
|
||||||
|
"> - `Brave Search` uses its own web index. As of May 2022, it covered over 10 billion pages and was used to serve 92% \n",
|
||||||
|
"> of search results without relying on any third parties, with the remainder being retrieved \n",
|
||||||
|
"> server-side from the Bing API or (on an opt-in basis) client-side from Google. According \n",
|
||||||
|
"> to Brave, the index was kept \"intentionally smaller than that of Google or Bing\" in order to \n",
|
||||||
|
"> help avoid spam and other low-quality content, with the disadvantage that \"Brave Search is \n",
|
||||||
|
"> not yet as good as Google in recovering long-tail queries.\"\n",
|
||||||
|
"> - `Brave Search Premium`: As of April 2023 Brave Search is an ad-free website, but it will \n",
|
||||||
|
"> eventually switch to a new model that will include ads and premium users will get an ad-free experience.\n",
|
||||||
|
"> User data including IP addresses won't be collected from its users by default. A premium account \n",
|
||||||
|
"> will be required for opt-in data-collection.\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "26f0888e-3f3e-4b82-ac4a-2df6feeccbe0",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Installation and Setup\n",
|
||||||
|
"\n",
|
||||||
|
"To get access to the Brave Search API, you need to [create an account and get an API key](https://api.search.brave.com/app/dashboard).\n"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 12,
|
||||||
|
"id": "d7d7be09-58bd-47d7-bf1b-33964564f777",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"api_key = \"...\""
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "b3ac92df-6ff0-4dbb-b32b-a7dc140c48ef",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": [
|
||||||
|
"from langchain.document_loaders import BraveSearchLoader"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "markdown",
|
||||||
|
"id": "7f483caf-58ef-4138-975a-5b783559dc1b",
|
||||||
|
"metadata": {},
|
||||||
|
"source": [
|
||||||
|
"## Example"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 6,
|
||||||
|
"id": "766634cf-3bc7-4656-939a-cafa218807a6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"3"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 6,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"loader = BraveSearchLoader(query=\"obama middle name\", api_key=api_key, search_kwargs={\"count\": 3})\n",
|
||||||
|
"docs = loader.load()\n",
|
||||||
|
"len(docs)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 11,
|
||||||
|
"id": "f1fcc9f1-cbdc-46b3-89d3-80311d557dc6",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"[{'title': \"Obama's Middle Name -- My Last Name -- is 'Hussein.' So?\",\n",
|
||||||
|
" 'link': 'https://www.cair.com/cair_in_the_news/obamas-middle-name-my-last-name-is-hussein-so/'},\n",
|
||||||
|
" {'title': \"What's up with Obama's middle name? - Quora\",\n",
|
||||||
|
" 'link': 'https://www.quora.com/Whats-up-with-Obamas-middle-name'},\n",
|
||||||
|
" {'title': 'Barack Obama | Biography, Parents, Education, Presidency, Books, ...',\n",
|
||||||
|
" 'link': 'https://www.britannica.com/biography/Barack-Obama'}]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 11,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"[doc.metadata for doc in docs]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": 10,
|
||||||
|
"id": "601bfd77-03d3-468e-843f-2523d5e215bd",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [
|
||||||
|
{
|
||||||
|
"data": {
|
||||||
|
"text/plain": [
|
||||||
|
"['I wasn’t sure whether to laugh or cry a few days back listening to radio talk show host Bill Cunningham repeatedly scream Barack <strong>Obama</strong>’<strong>s</strong> <strong>middle</strong> <strong>name</strong> — my last <strong>name</strong> — as if he had anti-Muslim Tourette’s. “Hussein,” Cunningham hissed like he was beckoning Satan when shouting the ...',\n",
|
||||||
|
" 'Answer (1 of 15): A better question would be, “What’s up with <strong>Obama</strong>’s first <strong>name</strong>?” President Barack Hussein <strong>Obama</strong>’s father’s <strong>name</strong> was Barack Hussein <strong>Obama</strong>. He was <strong>named</strong> after his father. Hussein, <strong>Obama</strong>’<strong>s</strong> <strong>middle</strong> <strong>name</strong>, is a very common Arabic <strong>name</strong>, meaning "good," "handsome," or ...',\n",
|
||||||
|
" 'Barack <strong>Obama</strong>, in full Barack Hussein <strong>Obama</strong> II, (born August 4, 1961, Honolulu, Hawaii, U.S.), 44th president of the United States (2009–17) and the first African American to hold the office. Before winning the presidency, <strong>Obama</strong> represented Illinois in the U.S.']"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"execution_count": 10,
|
||||||
|
"metadata": {},
|
||||||
|
"output_type": "execute_result"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"source": [
|
||||||
|
"[doc.page_content for doc in docs]"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"cell_type": "code",
|
||||||
|
"execution_count": null,
|
||||||
|
"id": "74a6ba54-9e48-4bac-ab9b-03eabd19eb81",
|
||||||
|
"metadata": {},
|
||||||
|
"outputs": [],
|
||||||
|
"source": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"metadata": {
|
||||||
|
"kernelspec": {
|
||||||
|
"display_name": "Python 3 (ipykernel)",
|
||||||
|
"language": "python",
|
||||||
|
"name": "python3"
|
||||||
|
},
|
||||||
|
"language_info": {
|
||||||
|
"codemirror_mode": {
|
||||||
|
"name": "ipython",
|
||||||
|
"version": 3
|
||||||
|
},
|
||||||
|
"file_extension": ".py",
|
||||||
|
"mimetype": "text/x-python",
|
||||||
|
"name": "python",
|
||||||
|
"nbconvert_exporter": "python",
|
||||||
|
"pygments_lexer": "ipython3",
|
||||||
|
"version": "3.10.6"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"nbformat": 4,
|
||||||
|
"nbformat_minor": 5
|
||||||
|
}
|
@ -23,6 +23,7 @@ from langchain.document_loaders.blob_loaders import (
|
|||||||
YoutubeAudioLoader,
|
YoutubeAudioLoader,
|
||||||
)
|
)
|
||||||
from langchain.document_loaders.blockchain import BlockchainDocumentLoader
|
from langchain.document_loaders.blockchain import BlockchainDocumentLoader
|
||||||
|
from langchain.document_loaders.brave_search import BraveSearchLoader
|
||||||
from langchain.document_loaders.chatgpt import ChatGPTLoader
|
from langchain.document_loaders.chatgpt import ChatGPTLoader
|
||||||
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
from langchain.document_loaders.college_confidential import CollegeConfidentialLoader
|
||||||
from langchain.document_loaders.confluence import ConfluenceLoader
|
from langchain.document_loaders.confluence import ConfluenceLoader
|
||||||
@ -168,6 +169,7 @@ __all__ = [
|
|||||||
"Blob",
|
"Blob",
|
||||||
"BlobLoader",
|
"BlobLoader",
|
||||||
"BlockchainDocumentLoader",
|
"BlockchainDocumentLoader",
|
||||||
|
"BraveSearchLoader",
|
||||||
"CSVLoader",
|
"CSVLoader",
|
||||||
"ChatGPTLoader",
|
"ChatGPTLoader",
|
||||||
"CoNLLULoader",
|
"CoNLLULoader",
|
||||||
|
32
langchain/document_loaders/brave_search.py
Normal file
32
langchain/document_loaders/brave_search.py
Normal file
@ -0,0 +1,32 @@
|
|||||||
|
from typing import Iterator, List, Optional
|
||||||
|
|
||||||
|
from langchain.docstore.document import Document
|
||||||
|
from langchain.document_loaders.base import BaseLoader
|
||||||
|
from langchain.utilities.brave_search import BraveSearchWrapper
|
||||||
|
|
||||||
|
|
||||||
|
class BraveSearchLoader(BaseLoader):
    """Document loader that turns Brave Search results for one query into Documents."""

    def __init__(self, query: str, api_key: str, search_kwargs: Optional[dict] = None):
        """Store the search configuration; no request is issued here.

        Args:
            query: Text to search for.
            api_key: Brave Search API key, forwarded to ``BraveSearchWrapper``.
            search_kwargs: Extra options forwarded to ``BraveSearchWrapper``.
                An empty dict is used when this is omitted or empty.
        """
        self.query = query
        self.api_key = api_key
        self.search_kwargs = search_kwargs if search_kwargs else {}

    def load(self) -> List[Document]:
        """Run the stored query and return every result as a ``Document``.

        A new ``BraveSearchWrapper`` is built on each call from the stored
        API key and search options.
        """
        wrapper = BraveSearchWrapper(
            api_key=self.api_key,
            search_kwargs=self.search_kwargs,
        )
        documents = wrapper.download_documents(self.query)
        return documents

    def lazy_load(self) -> Iterator[Document]:
        """Yield the documents one at a time.

        The full result list is still fetched by ``load()`` first; this only
        exposes it through an iterator.
        """
        yield from self.load()
|
@ -1,40 +1,68 @@
|
|||||||
import json
|
import json
|
||||||
|
from typing import List
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from langchain.schema import Document
|
||||||
|
|
||||||
|
|
||||||
class BraveSearchWrapper(BaseModel):
    """Small client for the Brave Search web-search endpoint."""

    # Sent as the ``X-Subscription-Token`` request header.
    api_key: str
    # Extra query-string parameters for every request; ``q`` is always
    # overridden by the query passed to the method.
    search_kwargs: dict = Field(default_factory=dict)
    base_url = "https://api.search.brave.com/res/v1/web/search"

    def run(self, query: str) -> str:
        """Query the Brave search engine and return the results as a JSON string.

        Args:
            query: The query to search for.

        Returns:
            A JSON array of objects with ``title``, ``link`` and ``snippet``
            keys (``"[]"`` when there are no web results).
        """
        web_search_results = self._search_request(query=query)
        final_results = [
            {
                "title": item.get("title"),
                "link": item.get("url"),
                "snippet": item.get("description"),
            }
            for item in web_search_results
        ]
        return json.dumps(final_results)

    def download_documents(self, query: str) -> List[Document]:
        """Query the Brave search engine and return the results as Documents.

        Args:
            query: The query to search for.

        Returns:
            One ``Document`` per web result. The result description becomes
            ``page_content``; ``title`` and ``link`` go into ``metadata``.
        """
        results = self._search_request(query)
        return [
            Document(
                # A result may lack a description; fall back to an empty
                # string rather than passing None as the page content.
                # NOTE(review): assumes Document.page_content must be a str.
                page_content=item.get("description") or "",
                metadata={"title": item.get("title"), "link": item.get("url")},
            )
            for item in results
        ]

    def _search_request(self, query: str) -> List[dict]:
        """Call the search endpoint and return the raw list of web results.

        Args:
            query: The query to search for.

        Returns:
            The ``web.results`` list from the response, or an empty list when
            that section is missing, null, or not a list.

        Raises:
            ValueError: If the request URL could not be prepared.
            Exception: If the server answers with a non-OK HTTP status.
        """
        headers = {
            "X-Subscription-Token": self.api_key,
            "Accept": "application/json",
        }
        req = requests.PreparedRequest()
        # ``q`` is merged last so it cannot be overridden by search_kwargs.
        params = {**self.search_kwargs, **{"q": query}}
        req.prepare_url(self.base_url, params)
        if req.url is None:
            raise ValueError("prepared url is None, this should not happen")

        # NOTE(review): no timeout is passed, so this call can block
        # indefinitely on an unresponsive server.
        response = requests.get(req.url, headers=headers)
        if not response.ok:
            raise Exception(f"HTTP error {response.status_code}")

        # Validate the payload shape instead of chaining ``.get`` calls: a
        # ``"web": null`` section or a non-list ``results`` value would
        # otherwise raise in this method or in its callers.
        payload = response.json()
        web_section = payload.get("web") if isinstance(payload, dict) else None
        results = (
            web_section.get("results") if isinstance(web_section, dict) else None
        )
        return results if isinstance(results, list) else []
Loading…
Reference in New Issue
Block a user