diff --git a/docs/docs/integrations/tools/wikidata.ipynb b/docs/docs/integrations/tools/wikidata.ipynb
new file mode 100644
index 0000000000..0bcf74d08b
--- /dev/null
+++ b/docs/docs/integrations/tools/wikidata.ipynb
@@ -0,0 +1,73 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "c4b39799",
+   "metadata": {},
+   "source": [
+    "# Wikidata\n",
+    "\n",
+    ">[Wikidata](https://wikidata.org/) is a free and open knowledge base that can be read and edited by both humans and machines. It is one of the world's largest open knowledge bases.\n",
+    "\n",
+    "First, you need to install the `wikibase-rest-api-client` and `mediawikiapi` Python packages."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d9195d4",
+   "metadata": {
+    "vscode": {
+     "languageId": "shellscript"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "%pip install --upgrade --quiet wikibase-rest-api-client mediawikiapi"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "955988a1-ebc2-4c9a-9298-c493fe842de1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun\n",
+    "\n",
+    "wikidata = WikidataQueryRun(api_wrapper=WikidataAPIWrapper())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "9926a8a7-3e4e-4a97-ba43-7e5a274b9561",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(wikidata.run(\"Alan Turing\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/libs/community/langchain_community/tools/wikidata/__init__.py b/libs/community/langchain_community/tools/wikidata/__init__.py
new file mode 100644
index 0000000000..a3b32ff4d9
--- /dev/null
+++ b/libs/community/langchain_community/tools/wikidata/__init__.py
@@ -0,0 +1 @@
+"""Wikidata API toolkit."""
diff --git a/libs/community/langchain_community/tools/wikidata/tool.py b/libs/community/langchain_community/tools/wikidata/tool.py
new file mode 100644
index 0000000000..c34096cf01
--- /dev/null
+++ b/libs/community/langchain_community/tools/wikidata/tool.py
@@ -0,0 +1,30 @@
+"""Tool for the Wikidata API."""
+
+from typing import Optional
+
+from langchain_core.callbacks import CallbackManagerForToolRun
+from langchain_core.tools import BaseTool
+
+from langchain_community.utilities.wikidata import WikidataAPIWrapper
+
+
+class WikidataQueryRun(BaseTool):
+    """Tool that searches the Wikidata API."""
+
+    name: str = "Wikidata"
+    description: str = (
+        "A wrapper around Wikidata. "
+        "Useful for when you need to answer general questions about "
+        "people, places, companies, facts, historical events, or other subjects. "
+        "Input should be the exact name of the item you want information about "
+        "or a Wikidata QID."
+    )
+    api_wrapper: WikidataAPIWrapper
+
+    def _run(
+        self,
+        query: str,
+        run_manager: Optional[CallbackManagerForToolRun] = None,
+    ) -> str:
+        """Use the Wikidata tool."""
+        return self.api_wrapper.run(query)
diff --git a/libs/community/langchain_community/utilities/wikidata.py b/libs/community/langchain_community/utilities/wikidata.py
new file mode 100644
index 0000000000..c31009d78d
--- /dev/null
+++ b/libs/community/langchain_community/utilities/wikidata.py
@@ -0,0 +1,181 @@
+"""Util that calls Wikidata."""
+
+import logging
+from typing import Any, Dict, List, Optional
+
+from langchain_core.documents import Document
+from langchain_core.pydantic_v1 import BaseModel, root_validator
+
+logger = logging.getLogger(__name__)
+
+WIKIDATA_MAX_QUERY_LENGTH = 300
+# Common properties to include by default, selected from
+# https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all
+DEFAULT_PROPERTIES = [
+    "P31",
+    "P279",
+    "P27",
+    "P361",
+    "P527",
+    "P495",
+    "P17",
+    "P585",
+    "P131",
+    "P106",
+    "P21",
+    "P569",
+    "P570",
+    "P577",
+    "P50",
+    "P571",
+    "P641",
+    "P625",
+    "P19",
+    "P69",
+    "P108",
+    "P136",
+    "P39",
+    "P161",
+    "P20",
+    "P101",
+    "P179",
+    "P175",
+    "P7937",
+    "P57",
+    "P607",
+    "P509",
+    "P800",
+    "P449",
+    "P580",
+    "P582",
+    "P276",
+    "P112",
+    "P740",
+    "P159",
+    "P452",
+    "P102",
+    "P1142",
+    "P1387",
+    "P1576",
+    "P140",
+    "P178",
+    "P287",
+    "P25",
+    "P22",
+    "P40",
+    "P185",
+    "P802",
+    "P1416",
+]
+DEFAULT_LANG_CODE = "en"
+WIKIDATA_USER_AGENT = "langchain-wikidata"
+WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"
+WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v0/"
+
+
+class WikidataAPIWrapper(BaseModel):
+    """Wrapper around the Wikidata API.
+
+    To use, you should have the ``wikibase-rest-api-client`` and
+    ``mediawikiapi`` Python packages installed.
+    This wrapper uses the Wikibase APIs to conduct searches and
+    fetch item content. By default, it returns the item content
+    of the top-k results.
+    Document content is truncated to ``doc_content_chars_max`` characters.
+    """
+
+    wikidata_mw: Any  #: :meta private:
+    wikidata_rest: Any  #: :meta private:
+    top_k_results: int = 2
+    load_all_available_meta: bool = False
+    doc_content_chars_max: int = 4000
+    wikidata_props: List[str] = DEFAULT_PROPERTIES
+    lang: str = DEFAULT_LANG_CODE
+
+    @root_validator()
+    def validate_environment(cls, values: Dict) -> Dict:
+        """Validate that the required python packages exist in the environment."""
+        try:
+            from mediawikiapi import MediaWikiAPI
+            from mediawikiapi.config import Config
+
+            values["wikidata_mw"] = MediaWikiAPI(
+                Config(user_agent=WIKIDATA_USER_AGENT, mediawiki_url=WIKIDATA_API_URL)
+            )
+        except ImportError:
+            raise ImportError(
+                "Could not import mediawikiapi python package. "
+                "Please install it with `pip install mediawikiapi`."
+            )
+
+        try:
+            from wikibase_rest_api_client import Client
+
+            client = Client(
+                timeout=60,
+                base_url=WIKIDATA_REST_API_URL,
+                headers={"User-Agent": WIKIDATA_USER_AGENT},
+                follow_redirects=True,
+            )
+            values["wikidata_rest"] = client
+        except ImportError:
+            raise ImportError(
+                "Could not import wikibase_rest_api_client python package. "
+                "Please install it with `pip install wikibase-rest-api-client`."
+            )
+        return values
+
+    def _item_to_document(self, qid: str) -> Optional[Document]:
+        from wikibase_rest_api_client.utilities.fluent import FluentWikibaseClient
+
+        fluent_client: FluentWikibaseClient = FluentWikibaseClient(
+            self.wikidata_rest, supported_props=self.wikidata_props, lang=self.lang
+        )
+        resp = fluent_client.get_item(qid)
+
+        if not resp:
+            logger.warning(f"Could not find item {qid} in Wikidata")
+            return None
+
+        doc_lines = []
+        if resp.label:
+            doc_lines.append(f"Label: {resp.label}")
+        if resp.description:
+            doc_lines.append(f"Description: {resp.description}")
+        if resp.aliases:
+            doc_lines.append(f"Aliases: {', '.join(resp.aliases)}")
+        for prop, values in resp.statements.items():
+            if values:
+                doc_lines.append(f"{prop.label}: {', '.join(values)}")
+
+        return Document(
+            page_content=("\n".join(doc_lines))[: self.doc_content_chars_max],
+            metadata={"title": qid, "source": f"https://www.wikidata.org/wiki/{qid}"},
+        )
+
+    def load(self, query: str) -> List[Document]:
+        """
+        Run Wikidata search and get the item documents plus the meta information.
+        """
+
+        clipped_query = query[:WIKIDATA_MAX_QUERY_LENGTH]
+        items = self.wikidata_mw.search(clipped_query, results=self.top_k_results)
+        docs = []
+        for item in items[: self.top_k_results]:
+            if doc := self._item_to_document(item):
+                docs.append(doc)
+        return docs
+
+    def run(self, query: str) -> str:
+        """Run Wikidata search and get item summaries."""
+
+        clipped_query = query[:WIKIDATA_MAX_QUERY_LENGTH]
+        items = self.wikidata_mw.search(clipped_query, results=self.top_k_results)
+
+        docs = []
+        for item in items[: self.top_k_results]:
+            if doc := self._item_to_document(item):
+                docs.append(f"Result {item}:\n{doc.page_content}")
+        if not docs:
+            return "No good Wikidata Search Result was found"
+        return "\n\n".join(docs)[: self.doc_content_chars_max]
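
A minimal sketch of exercising the wrapper and tool introduced above, assuming the `wikibase-rest-api-client` and `mediawikiapi` packages are installed and wikidata.org is reachable; the query strings and non-default settings are purely illustrative.

# Sketch: direct use of the new WikidataAPIWrapper and WikidataQueryRun.
# Assumes the `wikibase-rest-api-client` and `mediawikiapi` packages are installed
# and that wikidata.org is reachable; queries and settings are illustrative only.
from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun

# Wrapper with non-default settings: three results, French labels, shorter docs.
api_wrapper = WikidataAPIWrapper(top_k_results=3, lang="fr", doc_content_chars_max=2000)

# `run` returns a single formatted string of item summaries.
print(api_wrapper.run("Ada Lovelace"))

# `load` returns Document objects whose metadata carries the QID and source URL.
for doc in api_wrapper.load("Ada Lovelace"):
    print(doc.metadata["source"])

# The tool wraps `run`, so it can be handed to an agent.
tool = WikidataQueryRun(api_wrapper=api_wrapper)
print(tool.run("Q7259"))  # per the tool description, a QID is also valid input

Because `load()` returns `Document` objects, the wrapper can also feed a retrieval-style pipeline, while `run()` is the string interface the tool exposes to agents.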
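
To see the tool driving an agent, something like the following sketch should work; it assumes `langchain` and `langchain-openai` are installed and `OPENAI_API_KEY` is set, and any chat model can stand in for the one shown.

# Sketch: wiring the Wikidata tool into a ReAct-style agent.
# Assumes `langchain` and `langchain-openai` are installed and OPENAI_API_KEY is set;
# the model name is an assumption, not part of this change.
from langchain.agents import AgentType, initialize_agent
from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-4", temperature=0)
tools = [WikidataQueryRun(api_wrapper=WikidataAPIWrapper())]

agent = initialize_agent(
    tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True
)
agent.run("When was Alan Turing born, and what is he known for?")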