diff --git a/docs/extras/ecosystem/integrations/dataforseo.mdx b/docs/extras/ecosystem/integrations/dataforseo.mdx
new file mode 100644
index 0000000000..9bb9fb1c85
--- /dev/null
+++ b/docs/extras/ecosystem/integrations/dataforseo.mdx
@@ -0,0 +1,51 @@
+# DataForSEO
+
+This page provides instructions on how to use the DataForSEO search APIs within LangChain.
+
+## Installation and Setup
+
+- Get a DataForSEO API Access login and password, and set them as environment variables (`DATAFORSEO_LOGIN` and `DATAFORSEO_PASSWORD` respectively). You can find them in your dashboard.
+
+## Wrappers
+
+### Utility
+
+The DataForSEO utility wraps the API. To import this utility, use:
+
+```python
+from langchain.utilities import DataForSeoAPIWrapper
+```
+
+For a detailed walkthrough of this wrapper, see [this notebook](/docs/modules/agents/tools/integrations/dataforseo.ipynb).
+
+### Tool
+
+You can also load this wrapper as a Tool to use with an Agent:
+
+```python
+from langchain.agents import load_tools
+tools = load_tools(["dataforseo-api-search"])
+```
+
+## Example Usage
+
+```python
+dataforseo = DataForSeoAPIWrapper(api_login="your_login", api_password="your_password")
+result = dataforseo.run("Bill Gates")
+print(result)
+```
+
+## Environment Variables
+
+You can store your DataForSEO API Access login and password as environment variables. The wrapper will automatically check for these environment variables if no values are provided:
+
+```python
+import os
+
+os.environ["DATAFORSEO_LOGIN"] = "your_login"
+os.environ["DATAFORSEO_PASSWORD"] = "your_password"
+
+dataforseo = DataForSeoAPIWrapper()
+result = dataforseo.run("weather in Los Angeles")
+print(result)
+```
diff --git a/docs/extras/modules/agents/tools/integrations/dataforseo.ipynb b/docs/extras/modules/agents/tools/integrations/dataforseo.ipynb
new file mode 100644
index 0000000000..b1eb1fb1ac
--- /dev/null
+++ b/docs/extras/modules/agents/tools/integrations/dataforseo.ipynb
@@ -0,0 +1,226 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# DataForSeo API Wrapper\n",
+    "This notebook demonstrates how to use the DataForSeo API wrapper to obtain search engine results. The DataForSeo API allows users to retrieve SERPs from the most popular search engines, such as Google, Bing, and Yahoo. It also allows you to get SERPs from different search engine types, such as Maps, News, and Events.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain.utilities import DataForSeoAPIWrapper"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Setting up the API wrapper with your credentials\n",
+    "You can obtain your API credentials by registering on the DataForSeo website."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ[\"DATAFORSEO_LOGIN\"] = \"your_api_access_username\"\n",
+    "os.environ[\"DATAFORSEO_PASSWORD\"] = \"your_api_access_password\"\n",
+    "\n",
+    "wrapper = DataForSeoAPIWrapper()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The run method will return the first result snippet from one of the following elements: answer_box, knowledge_graph, featured_snippet, shopping, organic."
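Reviewer note on the cell above: `run` returns a plain string and falls back through those element types in priority order, so the same call works whether the engine answers with an answer box, a knowledge graph entry, or an ordinary organic snippet. A minimal sketch of the two documented ways to supply credentials (the query is only illustrative):

```python
from langchain.utilities import DataForSeoAPIWrapper

# Option 1: rely on DATAFORSEO_LOGIN / DATAFORSEO_PASSWORD being set in the environment.
wrapper = DataForSeoAPIWrapper()

# Option 2: pass the credentials explicitly instead of using environment variables.
wrapper = DataForSeoAPIWrapper(api_login="your_login", api_password="your_password")

# Returns the first snippet found, checking answer_box, knowledge_graph,
# featured_snippet, shopping, and organic results in that order.
print(wrapper.run("Weather in Los Angeles"))
```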
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "wrapper.run(\"Weather in Los Angeles\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## The Difference Between `run` and `results`\n", + "`run` and `results` are two methods provided by the `DataForSeoAPIWrapper` class.\n", + "\n", + "The `run` method executes the search and returns the first result snippet from the answer box, knowledge graph, featured snippet, shopping, or organic results. These elements are sorted by priority from highest to lowest.\n", + "\n", + "The `results` method returns a JSON response configured according to the parameters set in the wrapper. This allows for more flexibility in terms of what data you want to return from the API." + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Getting Results as JSON\n", + "You can customize the result types and fields you want to return in the JSON response. You can also set a maximum count for the number of top results to return." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "json_wrapper = DataForSeoAPIWrapper(\n", + " json_result_types=[\"organic\", \"knowledge_graph\", \"answer_box\"],\n", + " json_result_fields=[\"type\", \"title\", \"description\", \"text\"],\n", + " top_count=3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "json_wrapper.results(\"Bill Gates\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Customizing Location and Language\n", + "You can specify the location and language of your search results by passing additional parameters to the API wrapper." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customized_wrapper = DataForSeoAPIWrapper(\n", + " top_count=10,\n", + " json_result_types=[\"organic\", \"local_pack\"],\n", + " json_result_fields=[\"title\", \"description\", \"type\"],\n", + " params={\"location_name\": \"Germany\", \"language_code\": \"en\"})\n", + "customized_wrapper.results(\"coffee near me\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Customizing the Search Engine\n", + "You can also specify the search engine you want to use." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customized_wrapper = DataForSeoAPIWrapper(\n", + " top_count=10,\n", + " json_result_types=[\"organic\", \"local_pack\"],\n", + " json_result_fields=[\"title\", \"description\", \"type\"],\n", + " params={\"location_name\": \"Germany\", \"language_code\": \"en\", \"se_name\": \"bing\"})\n", + "customized_wrapper.results(\"coffee near me\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Customizing the Search Type\n", + "The API wrapper also allows you to specify the type of search you want to perform. For example, you can perform a maps search." 
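Before the maps example that follows, a quick sketch of what `results` hands back: a Python list of item dictionaries, trimmed to the fields you asked for. Here the SERP type is switched to news via `se_type`; the keyword and field names mirror the news integration test and are purely illustrative:

```python
from langchain.utilities import DataForSeoAPIWrapper

news_wrapper = DataForSeoAPIWrapper(
    top_count=5,
    json_result_fields=["title", "snippet"],
    params={"se_type": "news"},  # switch the SERP type from the default "organic"
)

# `results` returns a list of dicts, one per SERP item, filtered to the requested fields.
for item in news_wrapper.results("iphone"):
    print(item.get("title"))
```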
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "maps_search = DataForSeoAPIWrapper(\n", + " top_count=10,\n", + " json_result_fields=[\"title\", \"value\", \"address\", \"rating\", \"type\"],\n", + " params={\"location_coordinate\": \"52.512,13.36,12z\", \"language_code\": \"en\", \"se_type\": \"maps\"})\n", + "maps_search.results(\"coffee near me\")" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Integration with Langchain Agents\n", + "You can use the `Tool` class from the `langchain.agents` module to integrate the `DataForSeoAPIWrapper` with a langchain agent. The `Tool` class encapsulates a function that the agent can call." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import Tool\n", + "search = DataForSeoAPIWrapper(\n", + " top_count=3,\n", + " json_result_types=[\"organic\"],\n", + " json_result_fields=[\"title\", \"description\", \"type\"])\n", + "tool = Tool(\n", + " name=\"google-search-answer\",\n", + " description=\"My new answer tool\",\n", + " func=search.run,\n", + ")\n", + "json_tool = Tool(\n", + " name=\"google-search-json\",\n", + " description=\"My new json tool\",\n", + " func=search.results,\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/langchain/agents/load_tools.py b/langchain/agents/load_tools.py index f4a27bd696..2b7b08668d 100644 --- a/langchain/agents/load_tools.py +++ b/langchain/agents/load_tools.py @@ -38,6 +38,8 @@ from langchain.tools.sleep.tool import SleepTool from langchain.tools.wikipedia.tool import WikipediaQueryRun from langchain.tools.wolfram_alpha.tool import WolframAlphaQueryRun from langchain.tools.openweathermap.tool import OpenWeatherMapQueryRun +from langchain.tools.dataforseo_api_search import DataForSeoAPISearchRun +from langchain.tools.dataforseo_api_search import DataForSeoAPISearchResults from langchain.utilities import ArxivAPIWrapper from langchain.utilities import PubMedAPIWrapper from langchain.utilities.bing_search import BingSearchAPIWrapper @@ -53,6 +55,7 @@ from langchain.utilities.twilio import TwilioAPIWrapper from langchain.utilities.wikipedia import WikipediaAPIWrapper from langchain.utilities.wolfram_alpha import WolframAlphaAPIWrapper from langchain.utilities.openweathermap import OpenWeatherMapAPIWrapper +from langchain.utilities.dataforseo_api_search import DataForSeoAPIWrapper def _get_python_repl() -> BaseTool: @@ -278,6 +281,14 @@ def _get_openweathermap(**kwargs: Any) -> BaseTool: return OpenWeatherMapQueryRun(api_wrapper=OpenWeatherMapAPIWrapper(**kwargs)) +def _get_dataforseo_api_search(**kwargs: Any) -> BaseTool: + return DataForSeoAPISearchRun(api_wrapper=DataForSeoAPIWrapper(**kwargs)) + + +def _get_dataforseo_api_search_json(**kwargs: Any) -> BaseTool: + return DataForSeoAPISearchResults(api_wrapper=DataForSeoAPIWrapper(**kwargs)) + + _EXTRA_LLM_TOOLS: Dict[ str, Tuple[Callable[[Arg(BaseLanguageModel, "llm"), KwArg(Any)], BaseTool], List[str]], @@ -326,6 +337,14 @@ 
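The `load_tools.py` changes above and below register two tool names, `dataforseo-api-search` (string answers) and `dataforseo-api-search-json` (raw result items), with `api_login`, `api_password`, and `aiosession` as optional keyword arguments. A hedged sketch of wiring the registered tools into an agent; the LLM, agent type, and prompt are illustrative and assume an OpenAI key is configured:

```python
from langchain.agents import AgentType, initialize_agent, load_tools
from langchain.llms import OpenAI

# With DATAFORSEO_LOGIN / DATAFORSEO_PASSWORD exported, no extra kwargs are needed;
# otherwise pass api_login=... and api_password=... to load_tools.
tools = load_tools(["dataforseo-api-search", "dataforseo-api-search-json"])

llm = OpenAI(temperature=0)  # illustrative model choice
agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)
agent.run("What is trending in tech news today?")
```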
_EXTRA_OPTIONAL_TOOLS: Dict[str, Tuple[Callable[[KwArg(Any)], BaseTool], List[st
     "sceneXplain": (_get_scenexplain, []),
     "graphql": (_get_graphql_tool, ["graphql_endpoint"]),
     "openweathermap-api": (_get_openweathermap, ["openweathermap_api_key"]),
+    "dataforseo-api-search": (
+        _get_dataforseo_api_search,
+        ["api_login", "api_password", "aiosession"],
+    ),
+    "dataforseo-api-search-json": (
+        _get_dataforseo_api_search_json,
+        ["api_login", "api_password", "aiosession"],
+    ),
 }
diff --git a/langchain/tools/dataforseo_api_search/__init__.py b/langchain/tools/dataforseo_api_search/__init__.py
new file mode 100644
index 0000000000..299ed386bf
--- /dev/null
+++ b/langchain/tools/dataforseo_api_search/__init__.py
@@ -0,0 +1,8 @@
+"""DataForSeo API Toolkit: tools for the DataForSeo SERP API."""
+
+from langchain.tools.dataforseo_api_search.tool import (
+    DataForSeoAPISearchResults,
+    DataForSeoAPISearchRun,
+)
+
+__all__ = ["DataForSeoAPISearchRun", "DataForSeoAPISearchResults"]
diff --git a/langchain/tools/dataforseo_api_search/tool.py b/langchain/tools/dataforseo_api_search/tool.py
new file mode 100644
index 0000000000..bc85e7b1ac
--- /dev/null
+++ b/langchain/tools/dataforseo_api_search/tool.py
@@ -0,0 +1,71 @@
+"""Tool for the DataForSeo SERP API."""
+
+from typing import Optional
+
+from pydantic.fields import Field
+
+from langchain.callbacks.manager import (
+    AsyncCallbackManagerForToolRun,
+    CallbackManagerForToolRun,
+)
+from langchain.tools.base import BaseTool
+from langchain.utilities.dataforseo_api_search import DataForSeoAPIWrapper
+
+
+class DataForSeoAPISearchRun(BaseTool):
+    """Tool that adds the capability to query the DataForSeo Google search API."""
+
+    name = "dataforseo_api_search"
+    description = (
+        "A robust Google Search API provided by DataForSeo. "
+        "This tool is handy when you need information about trending topics "
+        "or current events."
+    )
+    api_wrapper: DataForSeoAPIWrapper
+
+    def _run(
+        self,
+        query: str,
+        run_manager: Optional[CallbackManagerForToolRun] = None,
+    ) -> str:
+        """Use the tool."""
+        return str(self.api_wrapper.run(query))
+
+    async def _arun(
+        self,
+        query: str,
+        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+    ) -> str:
+        """Use the tool asynchronously."""
+        return str(await self.api_wrapper.arun(query))
+
+
+class DataForSeoAPISearchResults(BaseTool):
+    """Tool that has the capability to query the DataForSeo Google Search API
+    and get back JSON."""
+
+    name = "DataForSeo Results JSON"
+    description = (
+        "A comprehensive Google Search API provided by DataForSeo. "
+        "This tool is useful for obtaining real-time data on current events "
+        "or popular searches. "
+        "The input should be a search query and the output is a JSON object "
+        "of the query results."
+    )
+    api_wrapper: DataForSeoAPIWrapper = Field(default_factory=DataForSeoAPIWrapper)
+
+    def _run(
+        self,
+        query: str,
+        run_manager: Optional[CallbackManagerForToolRun] = None,
+    ) -> str:
+        """Use the tool."""
+        return str(self.api_wrapper.results(query))
+
+    async def _arun(
+        self,
+        query: str,
+        run_manager: Optional[AsyncCallbackManagerForToolRun] = None,
+    ) -> str:
+        """Use the tool asynchronously."""
+        return str(await self.api_wrapper.aresults(query))
diff --git a/langchain/utilities/dataforseo_api_search.py b/langchain/utilities/dataforseo_api_search.py
new file mode 100644
index 0000000000..a197a12d50
--- /dev/null
+++ b/langchain/utilities/dataforseo_api_search.py
@@ -0,0 +1,186 @@
+import base64
+from typing import Dict, Optional
+from urllib.parse import quote
+
+import aiohttp
+import requests
+from pydantic import BaseModel, Extra, Field, root_validator
+
+from langchain.utils import get_from_dict_or_env
+
+
+class DataForSeoAPIWrapper(BaseModel):
+    class Config:
+        """Configuration for this pydantic object."""
+
+        extra = Extra.forbid
+        arbitrary_types_allowed = True
+
+    default_params: dict = Field(
+        default={
+            "location_name": "United States",
+            "language_code": "en",
+            "depth": 10,
+            "se_name": "google",
+            "se_type": "organic",
+        }
+    )
+    params: dict = Field(default={})
+    api_login: Optional[str] = None
+    api_password: Optional[str] = None
+    json_result_types: Optional[list] = None
+    json_result_fields: Optional[list] = None
+    top_count: Optional[int] = None
+    aiosession: Optional[aiohttp.ClientSession] = None
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that the login and password exist in the environment."""
+        login = get_from_dict_or_env(values, "api_login", "DATAFORSEO_LOGIN")
+        password = get_from_dict_or_env(values, "api_password", "DATAFORSEO_PASSWORD")
+        values["api_login"] = login
+        values["api_password"] = password
+        return values
+
+    async def arun(self, url: str) -> str:
+        """Run request to DataForSEO SERP API and parse result async."""
+        return self._process_response(await self._aresponse_json(url))
+
+    def run(self, url: str) -> str:
+        """Run request to DataForSEO SERP API and parse result."""
+        return self._process_response(self._response_json(url))
+
+    def results(self, url: str) -> list:
+        res = self._response_json(url)
+        return self._filter_results(res)
+
+    async def aresults(self, url: str) -> list:
+        res = await self._aresponse_json(url)
+        return self._filter_results(res)
+
+    def _prepare_request(self, keyword: str) -> dict:
+        """Prepare the request details for the DataForSEO SERP API."""
+        if self.api_login is None or self.api_password is None:
+            raise ValueError("api_login or api_password is not provided")
+        cred = base64.b64encode(
+            f"{self.api_login}:{self.api_password}".encode("utf-8")
+        ).decode("utf-8")
+        headers = {"Authorization": f"Basic {cred}", "Content-Type": "application/json"}
+        obj = {"keyword": quote(keyword)}
+        obj = {**obj, **self.default_params, **self.params}
+        data = [obj]
+        _url = (
+            f"https://api.dataforseo.com/v3/serp/{obj['se_name']}"
+            f"/{obj['se_type']}/live/advanced"
+        )
+        return {
+            "url": _url,
+            "headers": headers,
+            "data": data,
+        }
+
+    def _check_response(self, response: dict) -> dict:
+        """Check the response from the DataForSEO SERP API for errors."""
+        if response.get("status_code") != 20000:
+            raise ValueError(
+                f"Got error from DataForSEO SERP API: {response.get('status_message')}"
+            )
+        return response
+
+    def _response_json(self, url: str) -> dict:
+ """Use requests to run request to DataForSEO SERP API and return results.""" + request_details = self._prepare_request(url) + response = requests.post( + request_details["url"], + headers=request_details["headers"], + json=request_details["data"], + ) + response.raise_for_status() + return self._check_response(response.json()) + + async def _aresponse_json(self, url: str) -> dict: + """Use aiohttp to request DataForSEO SERP API and return results async.""" + request_details = self._prepare_request(url) + if not self.aiosession: + async with aiohttp.ClientSession() as session: + async with session.post( + request_details["url"], + headers=request_details["headers"], + json=request_details["data"], + ) as response: + res = await response.json() + else: + async with self.aiosession.post( + request_details["url"], + headers=request_details["headers"], + json=request_details["data"], + ) as response: + res = await response.json() + return self._check_response(res) + + def _filter_results(self, res: dict) -> list: + output = [] + types = self.json_result_types if self.json_result_types is not None else [] + for task in res.get("tasks", []): + for result in task.get("result", []): + for item in result.get("items", []): + if len(types) == 0 or item.get("type", "") in types: + self._cleanup_unnecessary_items(item) + if len(item) != 0: + output.append(item) + if self.top_count is not None and len(output) >= self.top_count: + break + return output + + def _cleanup_unnecessary_items(self, d: dict) -> dict: + fields = self.json_result_fields if self.json_result_fields is not None else [] + if len(fields) > 0: + for k, v in list(d.items()): + if isinstance(v, dict): + self._cleanup_unnecessary_items(v) + if len(v) == 0: + del d[k] + elif k not in fields: + del d[k] + + if "xpath" in d: + del d["xpath"] + if "position" in d: + del d["position"] + if "rectangle" in d: + del d["rectangle"] + for k, v in list(d.items()): + if isinstance(v, dict): + self._cleanup_unnecessary_items(v) + return d + + def _process_response(self, res: dict) -> str: + """Process response from DataForSEO SERP API.""" + toret = "No good search result found" + for task in res.get("tasks", []): + for result in task.get("result", []): + item_types = result.get("item_types") + items = result.get("items", []) + if "answer_box" in item_types: + toret = next( + item for item in items if item.get("type") == "answer_box" + ).get("text") + elif "knowledge_graph" in item_types: + toret = next( + item for item in items if item.get("type") == "knowledge_graph" + ).get("description") + elif "featured_snippet" in item_types: + toret = next( + item for item in items if item.get("type") == "featured_snippet" + ).get("description") + elif "shopping" in item_types: + toret = next( + item for item in items if item.get("type") == "shopping" + ).get("price") + elif "organic" in item_types: + toret = next( + item for item in items if item.get("type") == "organic" + ).get("description") + if toret: + break + return toret diff --git a/tests/integration_tests/utilities/test_dataforseo_api.py b/tests/integration_tests/utilities/test_dataforseo_api.py new file mode 100644 index 0000000000..8a6a140bc8 --- /dev/null +++ b/tests/integration_tests/utilities/test_dataforseo_api.py @@ -0,0 +1,58 @@ +"""Integration test for Dataforseo API Wrapper.""" +import pytest + +from langchain.utilities.dataforseo_api_search import DataForSeoAPIWrapper + + +def test_search_call() -> None: + search = DataForSeoAPIWrapper() + output = search.run("pi value") + assert "3.14159" 
in output + + +def test_news_call() -> None: + search = DataForSeoAPIWrapper( + params={"se_type": "news"}, json_result_fields=["title", "snippet"] + ) + output = search.results("iphone") + assert any("Apple" in d["title"] or "Apple" in d["snippet"] for d in output) + + +def test_loc_call() -> None: + search = DataForSeoAPIWrapper( + params={"location_name": "Spain", "language_code": "es"} + ) + output = search.results("iphone") + assert "/es/" in output[0]["url"] + + +def test_maps_call() -> None: + search = DataForSeoAPIWrapper( + params={"location_name": "Spain", "language_code": "es", "se_type": "maps"} + ) + output = search.results("coffee") + assert all(i["address_info"]["country_code"] == "ES" for i in output) + + +def test_events_call() -> None: + search = DataForSeoAPIWrapper( + params={"location_name": "Spain", "language_code": "es", "se_type": "events"} + ) + output = search.results("concerts") + assert any( + "Madrid" in ((i["location_info"] or dict())["address"] or "") for i in output + ) + + +@pytest.mark.asyncio +async def test_async_call() -> None: + search = DataForSeoAPIWrapper() + output = await search.arun("pi value") + assert "3.14159" in output + + +@pytest.mark.asyncio +async def test_async_results() -> None: + search = DataForSeoAPIWrapper(json_result_types=["answer_box"]) + output = await search.aresults("New York timezone") + assert "Eastern Daylight Time" in output[0]["text"]
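The async tests above exercise `arun` and `aresults` with a fresh HTTP session per call. A short sketch of reusing one `aiohttp` session across several requests via the wrapper's `aiosession` field; credentials are assumed to be in the environment and the queries are illustrative:

```python
import asyncio

import aiohttp

from langchain.utilities import DataForSeoAPIWrapper


async def main() -> None:
    # One shared session for all DataForSEO calls made by this wrapper.
    async with aiohttp.ClientSession() as session:
        wrapper = DataForSeoAPIWrapper(
            aiosession=session,
            json_result_types=["organic"],
            top_count=3,
        )
        snippet = await wrapper.arun("pi value")  # parsed string answer
        items = await wrapper.aresults("New York timezone")  # list of item dicts
        print(snippet)
        print(items)


asyncio.run(main())
```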