mirror of
https://github.com/hwchase17/langchain
synced 2024-11-10 01:10:59 +00:00
196 lines
7.7 KiB
Python
196 lines
7.7 KiB
Python
|
import base64
|
||
|
from typing import Dict, Optional
|
||
|
from urllib.parse import quote
|
||
|
|
||
|
import aiohttp
|
||
|
import requests
|
||
|
from langchain_core.pydantic_v1 import BaseModel, Extra, Field, root_validator
|
||
|
from langchain_core.utils import get_from_dict_or_env
|
||
|
|
||
|
|
||
|
class DataForSeoAPIWrapper(BaseModel):
|
||
|
"""Wrapper around the DataForSeo API."""
|
||
|
|
||
|
class Config:
|
||
|
"""Configuration for this pydantic object."""
|
||
|
|
||
|
extra = Extra.forbid
|
||
|
arbitrary_types_allowed = True
|
||
|
|
||
|
default_params: dict = Field(
|
||
|
default={
|
||
|
"location_name": "United States",
|
||
|
"language_code": "en",
|
||
|
"depth": 10,
|
||
|
"se_name": "google",
|
||
|
"se_type": "organic",
|
||
|
}
|
||
|
)
|
||
|
"""Default parameters to use for the DataForSEO SERP API."""
|
||
|
params: dict = Field(default={})
|
||
|
"""Additional parameters to pass to the DataForSEO SERP API."""
|
||
|
api_login: Optional[str] = None
|
||
|
"""The API login to use for the DataForSEO SERP API."""
|
||
|
api_password: Optional[str] = None
|
||
|
"""The API password to use for the DataForSEO SERP API."""
|
||
|
json_result_types: Optional[list] = None
|
||
|
"""The JSON result types."""
|
||
|
json_result_fields: Optional[list] = None
|
||
|
"""The JSON result fields."""
|
||
|
top_count: Optional[int] = None
|
||
|
"""The number of top results to return."""
|
||
|
aiosession: Optional[aiohttp.ClientSession] = None
|
||
|
"""The aiohttp session to use for the DataForSEO SERP API."""
|
||
|
|
||
|
@root_validator()
|
||
|
def validate_environment(cls, values: Dict) -> Dict:
|
||
|
"""Validate that login and password exists in environment."""
|
||
|
login = get_from_dict_or_env(values, "api_login", "DATAFORSEO_LOGIN")
|
||
|
password = get_from_dict_or_env(values, "api_password", "DATAFORSEO_PASSWORD")
|
||
|
values["api_login"] = login
|
||
|
values["api_password"] = password
|
||
|
return values
|
||
|
|
||
|
async def arun(self, url: str) -> str:
|
||
|
"""Run request to DataForSEO SERP API and parse result async."""
|
||
|
return self._process_response(await self._aresponse_json(url))
|
||
|
|
||
|
def run(self, url: str) -> str:
|
||
|
"""Run request to DataForSEO SERP API and parse result async."""
|
||
|
return self._process_response(self._response_json(url))
|
||
|
|
||
|
def results(self, url: str) -> list:
|
||
|
res = self._response_json(url)
|
||
|
return self._filter_results(res)
|
||
|
|
||
|
async def aresults(self, url: str) -> list:
|
||
|
res = await self._aresponse_json(url)
|
||
|
return self._filter_results(res)
|
||
|
|
||
|
def _prepare_request(self, keyword: str) -> dict:
|
||
|
"""Prepare the request details for the DataForSEO SERP API."""
|
||
|
if self.api_login is None or self.api_password is None:
|
||
|
raise ValueError("api_login or api_password is not provided")
|
||
|
cred = base64.b64encode(
|
||
|
f"{self.api_login}:{self.api_password}".encode("utf-8")
|
||
|
).decode("utf-8")
|
||
|
headers = {"Authorization": f"Basic {cred}", "Content-Type": "application/json"}
|
||
|
obj = {"keyword": quote(keyword)}
|
||
|
obj = {**obj, **self.default_params, **self.params}
|
||
|
data = [obj]
|
||
|
_url = (
|
||
|
f"https://api.dataforseo.com/v3/serp/{obj['se_name']}"
|
||
|
f"/{obj['se_type']}/live/advanced"
|
||
|
)
|
||
|
return {
|
||
|
"url": _url,
|
||
|
"headers": headers,
|
||
|
"data": data,
|
||
|
}
|
||
|
|
||
|
def _check_response(self, response: dict) -> dict:
|
||
|
"""Check the response from the DataForSEO SERP API for errors."""
|
||
|
if response.get("status_code") != 20000:
|
||
|
raise ValueError(
|
||
|
f"Got error from DataForSEO SERP API: {response.get('status_message')}"
|
||
|
)
|
||
|
return response
|
||
|
|
||
|
def _response_json(self, url: str) -> dict:
|
||
|
"""Use requests to run request to DataForSEO SERP API and return results."""
|
||
|
request_details = self._prepare_request(url)
|
||
|
response = requests.post(
|
||
|
request_details["url"],
|
||
|
headers=request_details["headers"],
|
||
|
json=request_details["data"],
|
||
|
)
|
||
|
response.raise_for_status()
|
||
|
return self._check_response(response.json())
|
||
|
|
||
|
async def _aresponse_json(self, url: str) -> dict:
|
||
|
"""Use aiohttp to request DataForSEO SERP API and return results async."""
|
||
|
request_details = self._prepare_request(url)
|
||
|
if not self.aiosession:
|
||
|
async with aiohttp.ClientSession() as session:
|
||
|
async with session.post(
|
||
|
request_details["url"],
|
||
|
headers=request_details["headers"],
|
||
|
json=request_details["data"],
|
||
|
) as response:
|
||
|
res = await response.json()
|
||
|
else:
|
||
|
async with self.aiosession.post(
|
||
|
request_details["url"],
|
||
|
headers=request_details["headers"],
|
||
|
json=request_details["data"],
|
||
|
) as response:
|
||
|
res = await response.json()
|
||
|
return self._check_response(res)
|
||
|
|
||
|
def _filter_results(self, res: dict) -> list:
|
||
|
output = []
|
||
|
types = self.json_result_types if self.json_result_types is not None else []
|
||
|
for task in res.get("tasks", []):
|
||
|
for result in task.get("result", []):
|
||
|
for item in result.get("items", []):
|
||
|
if len(types) == 0 or item.get("type", "") in types:
|
||
|
self._cleanup_unnecessary_items(item)
|
||
|
if len(item) != 0:
|
||
|
output.append(item)
|
||
|
if self.top_count is not None and len(output) >= self.top_count:
|
||
|
break
|
||
|
return output
|
||
|
|
||
|
def _cleanup_unnecessary_items(self, d: dict) -> dict:
|
||
|
fields = self.json_result_fields if self.json_result_fields is not None else []
|
||
|
if len(fields) > 0:
|
||
|
for k, v in list(d.items()):
|
||
|
if isinstance(v, dict):
|
||
|
self._cleanup_unnecessary_items(v)
|
||
|
if len(v) == 0:
|
||
|
del d[k]
|
||
|
elif k not in fields:
|
||
|
del d[k]
|
||
|
|
||
|
if "xpath" in d:
|
||
|
del d["xpath"]
|
||
|
if "position" in d:
|
||
|
del d["position"]
|
||
|
if "rectangle" in d:
|
||
|
del d["rectangle"]
|
||
|
for k, v in list(d.items()):
|
||
|
if isinstance(v, dict):
|
||
|
self._cleanup_unnecessary_items(v)
|
||
|
return d
|
||
|
|
||
|
def _process_response(self, res: dict) -> str:
|
||
|
"""Process response from DataForSEO SERP API."""
|
||
|
toret = "No good search result found"
|
||
|
for task in res.get("tasks", []):
|
||
|
for result in task.get("result", []):
|
||
|
item_types = result.get("item_types")
|
||
|
items = result.get("items", [])
|
||
|
if "answer_box" in item_types:
|
||
|
toret = next(
|
||
|
item for item in items if item.get("type") == "answer_box"
|
||
|
).get("text")
|
||
|
elif "knowledge_graph" in item_types:
|
||
|
toret = next(
|
||
|
item for item in items if item.get("type") == "knowledge_graph"
|
||
|
).get("description")
|
||
|
elif "featured_snippet" in item_types:
|
||
|
toret = next(
|
||
|
item for item in items if item.get("type") == "featured_snippet"
|
||
|
).get("description")
|
||
|
elif "shopping" in item_types:
|
||
|
toret = next(
|
||
|
item for item in items if item.get("type") == "shopping"
|
||
|
).get("price")
|
||
|
elif "organic" in item_types:
|
||
|
toret = next(
|
||
|
item for item in items if item.get("type") == "organic"
|
||
|
).get("description")
|
||
|
if toret:
|
||
|
break
|
||
|
return toret
|