mirror of https://github.com/hwchase17/langchain
added `Brave Search` document_loader (#6989)
- Added `Brave Search` document loader. - Refactored BraveSearch wrapper - Added a Jupyter Notebook example - Added `Ecosystem/Integrations` BraveSearch page Please review: - DataLoaders / VectorStores / Retrievers: @rlancemartin, @eyurtsevpull/6440/head
parent
6d15854cda
commit
200be43da6
@ -0,0 +1,36 @@
|
||||
# Brave Search
|
||||
|
||||
|
||||
>[Brave Search](https://en.wikipedia.org/wiki/Brave_Search) is a search engine developed by Brave Software.
|
||||
> - `Brave Search` uses its own web index. As of May 2022, it covered over 10 billion pages and was used to serve 92%
|
||||
> of search results without relying on any third parties, with the remainder being retrieved
|
||||
> server-side from the Bing API or (on an opt-in basis) client-side from Google. According
|
||||
> to Brave, the index was kept "intentionally smaller than that of Google or Bing" in order to
|
||||
> help avoid spam and other low-quality content, with the disadvantage that "Brave Search is
|
||||
> not yet as good as Google in recovering long-tail queries."
|
||||
> - `Brave Search Premium`: As of April 2023, Brave Search is an ad-free website, but it will
|
||||
> eventually switch to a new model that will include ads, and premium users will get an ad-free experience.
|
||||
> User data, including IP addresses, won't be collected by default. A premium account
|
||||
> will be required for opt-in data collection.
|
||||
|
||||
|
||||
## Installation and Setup
|
||||
|
||||
To get access to the Brave Search API, you need to [create an account and get an API key](https://api.search.brave.com/app/dashboard).
|
||||
|
||||
|
||||
## Document Loader
|
||||
|
||||
See a [usage example](/docs/modules/data_connection/document_loaders/integrations/brave_search.html).
|
||||
|
||||
```python
|
||||
from langchain.document_loaders import BraveSearchLoader
|
||||
```
|
||||
|
||||
## Tool
|
||||
|
||||
See a [usage example](/docs/modules/agents/tools/integrations/brave_search.html).
|
||||
|
||||
```python
|
||||
from langchain.tools import BraveSearch
|
||||
```
|
@ -0,0 +1,32 @@
|
||||
from typing import Iterator, List, Optional
|
||||
|
||||
from langchain.docstore.document import Document
|
||||
from langchain.document_loaders.base import BaseLoader
|
||||
from langchain.utilities.brave_search import BraveSearchWrapper
|
||||
|
||||
|
||||
class BraveSearchLoader(BaseLoader):
    """Document loader that runs a single query against Brave Search.

    The search itself is delegated to ``BraveSearchWrapper``; this class only
    stores the query parameters and exposes the loader interface.
    """

    def __init__(self, query: str, api_key: str, search_kwargs: Optional[dict] = None):
        """Store the search parameters; no request is made here.

        Args:
            query: The query to search for.
            api_key: The Brave Search API key to use.
            search_kwargs: Extra search kwargs handed to the wrapper. A falsy
                value (``None`` or an empty dict) is replaced by a new dict.
        """
        self.query = query
        self.api_key = api_key
        self.search_kwargs = search_kwargs if search_kwargs else {}

    def load(self) -> List[Document]:
        """Run the stored query and return the results as Documents."""
        wrapper = BraveSearchWrapper(
            api_key=self.api_key,
            search_kwargs=self.search_kwargs,
        )
        documents = wrapper.download_documents(self.query)
        return documents

    def lazy_load(self) -> Iterator[Document]:
        """Yield the Documents one at a time.

        The complete result list is first obtained from ``load``; only the
        hand-off to the caller is incremental.
        """
        yield from self.load()
|
@ -1,40 +1,68 @@
|
||||
import json
|
||||
from typing import List
|
||||
|
||||
import requests
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
from langchain.schema import Document
|
||||
|
||||
|
||||
class BraveSearchWrapper(BaseModel):
    """Wrapper around the Brave Search web-search endpoint."""

    # Sent with every request as the ``X-Subscription-Token`` header.
    api_key: str
    # Extra query-string parameters merged into every request.
    search_kwargs: dict = Field(default_factory=dict)
    # Endpoint queried by ``_search_request``.
    base_url = "https://api.search.brave.com/res/v1/web/search"

    def run(self, query: str) -> str:
        """Query the Brave search engine and return the results as a JSON string.

        Args:
            query: The query to search for.

        Returns: The results as a JSON string; each entry has the keys
            ``title``, ``link`` and ``snippet``.

        """
        web_search_results = self._search_request(query=query)
        final_results = [
            {
                "title": item.get("title"),
                "link": item.get("url"),
                "snippet": item.get("description"),
            }
            for item in web_search_results
        ]
        return json.dumps(final_results)

    def download_documents(self, query: str) -> List[Document]:
        """Query the Brave search engine and return the results as a list of Documents.

        Args:
            query: The query to search for.

        Returns: The results as a list of Documents. The result description
            becomes ``page_content``; title and URL go into ``metadata``.

        """
        results = self._search_request(query)
        return [
            Document(
                page_content=item.get("description"),
                metadata={"title": item.get("title"), "link": item.get("url")},
            )
            for item in results
        ]

    def _search_request(self, query: str) -> List[dict]:
        """Send the query to the search endpoint and return the raw web results.

        Formatting is left to the callers (``run`` and ``download_documents``),
        so this helper returns the unmodified result dicts rather than a
        serialized string.

        Args:
            query: The query to search for.

        Returns: The list stored under ``web`` -> ``results`` in the JSON
            response, or an empty list when either key is missing.

        Raises:
            ValueError: If the prepared request URL is None.
            Exception: If the HTTP response status is not OK.

        """
        headers = {
            "X-Subscription-Token": self.api_key,
            "Accept": "application/json",
        }
        req = requests.PreparedRequest()
        # "q" is merged last so the explicit query wins over any "q" that was
        # supplied through search_kwargs.
        params = {**self.search_kwargs, "q": query}
        req.prepare_url(self.base_url, params)
        if req.url is None:
            raise ValueError("prepared url is None, this should not happen")

        response = requests.get(req.url, headers=headers)
        if not response.ok:
            raise Exception(f"HTTP error {response.status_code}")

        return response.json().get("web", {}).get("results", [])
|
||||
|
Loading…
Reference in New Issue