Google Scholar Search Tool using serpapi (#11513)

- **Description:** Implements the Google Scholar Tool as requested in
PR #11505. The tool uses the [serpapi python
package](https://serpapi.com/integrations/python#search-google-scholar)
and returns the results of a Google Scholar search for a given input
query (see the usage sketch below).

- **Tag maintainer:** @baskaryan, @eyurtsev, @hwchase17
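
A minimal usage sketch, mirroring the notebook added below (it assumes
`SERP_API_KEY` is exported and `google-search-results>=2.4.2` is installed):

```python
import os

from langchain.tools.google_scholar import GoogleScholarQueryRun
from langchain.utilities.google_scholar import GoogleScholarAPIWrapper

os.environ["SERP_API_KEY"] = ""  # set your serpapi key here

# The tool wraps GoogleScholarAPIWrapper and returns a formatted string
# of titles, authors, summaries, and citation counts for each result.
tool = GoogleScholarQueryRun(api_wrapper=GoogleScholarAPIWrapper())
print(tool.run("LLM Models"))
```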

@@ -0,0 +1,102 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Google Scholar\n",
"\n",
"This notebook goes through how to use Google Scholar Tool"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: google-search-results in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (2.4.2)\n",
"Requirement already satisfied: requests in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from google-search-results) (2.31.0)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (3.3.0)\n",
"Requirement already satisfied: idna<4,>=2.5 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (3.4)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (1.26.17)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /home/mohtashimkhan/mambaforge/envs/langchain/lib/python3.9/site-packages (from requests->google-search-results) (2023.5.7)\n"
]
}
],
"source": [
"!pip install google-search-results"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"from langchain.tools.google_scholar import GoogleScholarQueryRun\n",
"from langchain.utilities.google_scholar import GoogleScholarAPIWrapper\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Title: Large language models (LLM) and ChatGPT: what will the impact on nuclear medicine be?\\nAuthors: IL Alberts,K Shi\\nSummary: IL Alberts, L Mercolli, T Pyka, G Prenosil, K Shi… - European journal of …, 2023 - Springer\\nTotal-Citations: 28\\n\\nTitle: Dynamic Planning with a LLM\\nAuthors: G Dagan,F Keller,A Lascarides\\nSummary: G Dagan, F Keller, A Lascarides - arXiv preprint arXiv:2308.06391, 2023 - arxiv.org\\nTotal-Citations: 3\\n\\nTitle: Openagi: When llm meets domain experts\\nAuthors: Y Ge,W Hua,J Ji,J Tan,S Xu,Y Zhang\\nSummary: Y Ge, W Hua, J Ji, J Tan, S Xu, Y Zhang - arXiv preprint arXiv:2304.04370, 2023 - arxiv.org\\nTotal-Citations: 19\\n\\nTitle: Llm-planner: Few-shot grounded planning for embodied agents with large language models\\nAuthors: CH Song\\nSummary: CH Song, J Wu, C Washington… - Proceedings of the …, 2023 - openaccess.thecvf.com\\nTotal-Citations: 28\\n\\nTitle: The science of detecting llm-generated texts\\nAuthors: R Tang,YN Chuang,X Hu\\nSummary: R Tang, YN Chuang, X Hu - arXiv preprint arXiv:2303.07205, 2023 - arxiv.org\\nTotal-Citations: 23\\n\\nTitle: X-llm: Bootstrapping advanced large language models by treating multi-modalities as foreign languages\\nAuthors: F Chen,M Han,J Shi\\nSummary: F Chen, M Han, H Zhao, Q Zhang, J Shi, S Xu… - arXiv preprint arXiv …, 2023 - arxiv.org\\nTotal-Citations: 12\\n\\nTitle: 3d-llm: Injecting the 3d world into large language models\\nAuthors: Y Hong,H Zhen,P Chen,S Zheng,Y Du\\nSummary: Y Hong, H Zhen, P Chen, S Zheng, Y Du… - arXiv preprint arXiv …, 2023 - arxiv.org\\nTotal-Citations: 4\\n\\nTitle: The internal state of an llm knows when its lying\\nAuthors: A Azaria,T Mitchell\\nSummary: A Azaria, T Mitchell - arXiv preprint arXiv:2304.13734, 2023 - arxiv.org\\nTotal-Citations: 18\\n\\nTitle: LLM-Pruner: On the Structural Pruning of Large Language Models\\nAuthors: X Ma,G Fang,X Wang\\nSummary: X Ma, G Fang, X Wang - arXiv preprint arXiv:2305.11627, 2023 - arxiv.org\\nTotal-Citations: 15\\n\\nTitle: Large language models are few-shot testers: Exploring llm-based general bug reproduction\\nAuthors: S Kang,J Yoon,S Yoo\\nSummary: S Kang, J Yoon, S Yoo - 2023 IEEE/ACM 45th International …, 2023 - ieeexplore.ieee.org\\nTotal-Citations: 17'"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.environ[\"SERP_API_KEY\"] = \"\"\n",
"tool = GoogleScholarQueryRun(api_wrapper=GoogleScholarAPIWrapper())\n",
"tool.run(\"LLM Models\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3.9.16 ('langchain')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
},
"orig_nbformat": 4,
"vscode": {
"interpreter": {
"hash": "15e58ce194949b77a891bd4339ce3d86a9bd138e905926019517993f97db9e6c"
}
}
},
"nbformat": 4,
"nbformat_minor": 2
}

@@ -34,6 +34,7 @@ from langchain.tools.base import BaseTool
from langchain.tools.bing_search.tool import BingSearchRun
from langchain.tools.ddg_search.tool import DuckDuckGoSearchRun
from langchain.tools.google_search.tool import GoogleSearchResults, GoogleSearchRun
from langchain.tools.google_scholar.tool import GoogleScholarQueryRun
from langchain.tools.metaphor_search.tool import MetaphorSearchResults
from langchain.tools.google_serper.tool import GoogleSerperResults, GoogleSerperRun
from langchain.tools.searchapi.tool import SearchAPIResults, SearchAPIRun
@@ -64,6 +65,7 @@ from langchain.utilities.bing_search import BingSearchAPIWrapper
from langchain.utilities.duckduckgo_search import DuckDuckGoSearchAPIWrapper
from langchain.utilities.google_search import GoogleSearchAPIWrapper
from langchain.utilities.google_serper import GoogleSerperAPIWrapper
from langchain.utilities.google_scholar import GoogleScholarAPIWrapper
from langchain.utilities.metaphor_search import MetaphorSearchAPIWrapper
from langchain.utilities.awslambda import LambdaWrapper
from langchain.utilities.graphql import GraphQLAPIWrapper
@@ -222,6 +224,10 @@ def _get_google_serper(**kwargs: Any) -> BaseTool:
return GoogleSerperRun(api_wrapper=GoogleSerperAPIWrapper(**kwargs))
def _get_google_scholar(**kwargs: Any) -> BaseTool:
return GoogleScholarQueryRun(api_wrapper=GoogleScholarAPIWrapper(**kwargs))
def _get_google_serper_results_json(**kwargs: Any) -> BaseTool:
return GoogleSerperResults(api_wrapper=GoogleSerperAPIWrapper(**kwargs))
@@ -337,6 +343,10 @@ _EXTRA_OPTIONAL_TOOLS: Dict[str, Tuple[Callable[[KwArg(Any)], BaseTool], List[st
"metaphor-search": (_get_metaphor_search, ["metaphor_api_key"]),
"ddg-search": (_get_ddg_search, []),
"google-serper": (_get_google_serper, ["serper_api_key", "aiosession"]),
"google-scholar": (
_get_google_scholar,
["top_k_results", "hl", "lr", "serp_api_key"],
),
"google-serper-results-json": (
_get_google_serper_results_json,
["serper_api_key", "aiosession"],

@@ -0,0 +1,5 @@
"""Google Scholar API Toolkit."""
from langchain.tools.google_scholar.tool import GoogleScholarQueryRun
__all__ = ["GoogleScholarQueryRun"]

@@ -0,0 +1,28 @@
"""Tool for the Google Scholar"""
from typing import Optional
from langchain.callbacks.manager import CallbackManagerForToolRun
from langchain.tools.base import BaseTool
from langchain.utilities.google_scholar import GoogleScholarAPIWrapper
class GoogleScholarQueryRun(BaseTool):
"""Tool that queries the Google search API."""
name: str = "google_scholar"
description: str = (
"A wrapper around Google Scholar Search. "
"Useful for when you need to get information about"
"research papers from Google Scholar"
"Input should be a search query."
)
api_wrapper: GoogleScholarAPIWrapper
def _run(
self,
query: str,
run_manager: Optional[CallbackManagerForToolRun] = None,
) -> str:
"""Use the tool."""
return self.api_wrapper.run(query)

@@ -74,6 +74,12 @@ def _import_google_places_api() -> Any:
return GooglePlacesAPIWrapper
def _import_google_scholar() -> Any:
from langchain.utilities.google_scholar import GoogleScholarAPIWrapper
return GoogleScholarAPIWrapper
def _import_google_search() -> Any:
from langchain.utilities.google_search import GoogleSearchAPIWrapper
@@ -225,6 +231,8 @@ def __getattr__(name: str) -> Any:
return _import_duckduckgo_search()
elif name == "GoldenQueryAPIWrapper":
return _import_golden_query()
elif name == "GoogleScholarAPIWrapper":
return _import_google_scholar()
elif name == "GooglePlacesAPIWrapper":
return _import_google_places_api()
elif name == "GoogleSearchAPIWrapper":
@@ -286,6 +294,7 @@ __all__ = [
"DuckDuckGoSearchAPIWrapper",
"GoldenQueryAPIWrapper",
"GooglePlacesAPIWrapper",
"GoogleScholarAPIWrapper",
"GoogleSearchAPIWrapper",
"GoogleSerperAPIWrapper",
"GraphQLAPIWrapper",

@@ -0,0 +1,129 @@
"""Util that calls Google Scholar Search."""
from typing import Dict, Optional
from langchain.pydantic_v1 import BaseModel, Extra, root_validator
from langchain.utils import get_from_dict_or_env
class GoogleScholarAPIWrapper(BaseModel):
"""Wrapper for Google Scholar API
You can create serpapi key by signing up at: https://serpapi.com/users/sign_up.
The wrapper uses the serpapi python package:
https://serpapi.com/integrations/python#search-google-scholar
To use, you should have the environment variable ``SERP_API_KEY``
set with your API key, or pass `serp_api_key` as a named parameter
to the constructor.
Attributes:
top_k_results: number of results to return from the Google Scholar search.
By default, it returns the top 10 results.
hl: attribute defines the language to use for the Google Scholar search.
It's a two-letter language code.
(e.g., en for English, es for Spanish, or fr for French). Head to the
Google languages page for a full list of supported Google languages:
https://serpapi.com/google-languages
lr: attribute defines one or multiple languages to limit the search to.
It uses lang_{two-letter language code} to specify languages
and | as a delimiter. (e.g., lang_fr|lang_de will only search French
and German pages). Head to the Google lr languages for a full
list of supported languages: https://serpapi.com/google-lr-languages
Example:
.. code-block:: python
from langchain.utilities import GoogleScholarAPIWrapper
google_scholar = GoogleScholarAPIWrapper()
google_scholar.run('langchain')
"""
top_k_results: int = 10
hl: str = "en"
lr: str = "lang_en"
serp_api_key: Optional[str] = None
class Config:
"""Configuration for this pydantic object."""
extra = Extra.forbid
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that api key and python package exists in environment."""
serp_api_key = get_from_dict_or_env(values, "serp_api_key", "SERP_API_KEY")
values["SERP_API_KEY"] = serp_api_key
try:
from serpapi import GoogleScholarSearch
except ImportError:
raise ImportError(
"google-search-results is not installed. "
"Please install it with `pip install google-search-results"
">=2.4.2`"
)
GoogleScholarSearch.SERP_API_KEY = serp_api_key
values["google_scholar_engine"] = GoogleScholarSearch
return values
def run(self, query: str) -> str:
"""Run query through GoogleSearchScholar and parse result"""
total_results = []
page = 0
while page < max((self.top_k_results - 20), 1):
# We fetch 20 results per page, which is the maximum,
# in order to reduce the number of API calls.
# 0 is the first page of results, 20 is the 2nd page of results,
# 40 is the 3rd page of results, etc.
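# For example, with top_k_results=50 the loop issues requests at
# start=0 and start=20 (20 results each); the trailing block below
# then fetches the remaining 10 results at start=40.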
results = (
self.google_scholar_engine( # type: ignore
{
"q": query,
"start": page,
"hl": self.hl,
"num": min(
self.top_k_results, 20
), # if top_k_result is less than 20.
"lr": self.lr,
}
)
.get_dict()
.get("organic_results", [])
)
total_results.extend(results)
if not results: # No need to search for more pages if current page
# has returned no results
break
page += 20
if (
self.top_k_results % 20 != 0 and page > 20 and total_results
): # From the last page we only need top_k_results % 20 results
# when top_k_results is not divisible by 20.
results = (
self.google_scholar_engine( # type: ignore
{
"q": query,
"start": page,
"num": self.top_k_results % 20,
"hl": self.hl,
"lr": self.lr,
}
)
.get_dict()
.get("organic_results", [])
)
total_results.extend(results)
if not total_results:
return "No good Google Scholar Result was found"
docs = [
f"Title: {result.get('title','')}\n"
f"Authors: {','.join([author.get('name') for author in result.get('publication_info',{}).get('authors',[])])}\n" # noqa: E501
f"Summary: {result.get('publication_info',{}).get('summary','')}\n"
f"Total-Citations: {result.get('inline_links',{}).get('cited_by',{}).get('total','')}" # noqa: E501
for result in total_results
]
return "\n\n".join(docs)