community: Wikidata tool support (#16691)

- **Description:** Adds Wikidata support to langchain. Can read out
documents from Wikidata.
  - **Issue:** N/A
- **Dependencies:** Adds implicit dependencies for
`wikibase-rest-api-client` (for turning items into docs) and
`mediawikiapi` (for hitting the search endpoint)
  - **Twitter handle:** @derenrich

You can see an example of this tool used in a chain
[here](https://nbviewer.org/urls/d.erenrich.net/upload/Wikidata_Langchain.ipynb)
or
[here](https://nbviewer.org/urls/d.erenrich.net/upload/Wikidata_Lars_Kai_Hansen.ipynb)

<!-- Thank you for contributing to LangChain!


Please make sure your PR is passing linting and testing before
submitting. Run `make format`, `make lint` and `make test` from the root
of the package you've modified to check this locally.

See contribution guidelines for more information on how to write/run
tests, lint, etc: https://python.langchain.com/docs/contributing/

If you're adding a new integration, please include:
1. a test for the integration, preferably unit tests that do not rely on
network access,
2. an example notebook showing its use. It lives in
`docs/docs/integrations` directory.

If no one reviews your PR within a few days, please @-mention one of
@baskaryan, @eyurtsev, @hwchase17.
 -->
pull/16256/head
Daniel Erenrich 5 months ago committed by GitHub
parent 6ef718c5f4
commit 0600998f38
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

@ -0,0 +1,73 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "c4b39799",
"metadata": {},
"source": [
"# Wikidata\n",
"\n",
">[Wikidata](https://wikidata.org/) is a free and open knowledge base that can be read and edited by both humans and machines. Wikidata is one of the world's largest open knowledge bases.\n",
"\n",
"First, you need to install `wikibase-rest-api-client` and `mediawikiapi` python packages."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3d9195d4",
"metadata": {
"vscode": {
"languageId": "shellscript"
}
},
"outputs": [],
"source": [
"%pip install --upgrade --quiet wikibase-rest-api-client mediawikiapi"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "955988a1-ebc2-4c9a-9298-c493fe842de1",
"metadata": {},
"outputs": [],
"source": [
"from langchain_community.tools.wikidata.tool import WikidataAPIWrapper, WikidataQueryRun\n",
"\n",
"wikidata = WikidataQueryRun(api_wrapper=WikidataAPIWrapper())"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "9926a8a7-3e4e-4a97-ba43-7e5a274b9561",
"metadata": {},
"outputs": [],
"source": [
"print(wikidata.run(\"Alan Turing\"))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

@ -0,0 +1,30 @@
"""Tool for the Wikidata API."""
from typing import Optional
from langchain_core.callbacks import CallbackManagerForToolRun
from langchain_core.tools import BaseTool
from langchain_community.utilities.wikidata import WikidataAPIWrapper
class WikidataQueryRun(BaseTool):
"""Tool that searches the Wikidata API."""
name: str = "Wikidata"
description: str = (
"A wrapper around Wikidata. "
"Useful for when you need to answer general questions about "
"people, places, companies, facts, historical events, or other subjects. "
"Input should be the exact name of the item you want information about "
"or a Wikidata QID."
)
api_wrapper: WikidataAPIWrapper
def _run(
self,
query: str,
run_manager: Optional[CallbackManagerForToolRun] = None,
) -> str:
"""Use the Wikidata tool."""
return self.api_wrapper.run(query)

@ -0,0 +1,181 @@
"""Util that calls Wikidata."""
import logging
from typing import Any, Dict, List, Optional
from langchain_core.documents import Document
from langchain_core.pydantic_v1 import BaseModel, root_validator
logger = logging.getLogger(__name__)
WIKIDATA_MAX_QUERY_LENGTH = 300
# Common properties you probably want to see filtered from https://www.wikidata.org/wiki/Wikidata:Database_reports/List_of_properties/all
DEFAULT_PROPERTIES = [
"P31",
"P279",
"P27",
"P361",
"P527",
"P495",
"P17",
"P585",
"P131",
"P106",
"P21",
"P569",
"P570",
"P577",
"P50",
"P571",
"P641",
"P625",
"P19",
"P69",
"P108",
"P136",
"P39",
"P161",
"P20",
"P101",
"P179",
"P175",
"P7937",
"P57",
"P607",
"P509",
"P800",
"P449",
"P580",
"P582",
"P276",
"P69",
"P112",
"P740",
"P159",
"P452",
"P102",
"P1142",
"P1387",
"P1576",
"P140",
"P178",
"P287",
"P25",
"P22",
"P40",
"P185",
"P802",
"P1416",
]
DEFAULT_LANG_CODE = "en"
WIKIDATA_USER_AGENT = "langchain-wikidata"
WIKIDATA_API_URL = "https://www.wikidata.org/w/api.php"
WIKIDATA_REST_API_URL = "https://www.wikidata.org/w/rest.php/wikibase/v0/"
class WikidataAPIWrapper(BaseModel):
"""Wrapper around the Wikidata API.
To use, you should have the ``wikibase-rest-api-client`` and
``mediawikiapi `` python packages installed.
This wrapper will use the Wikibase APIs to conduct searches and
fetch item content. By default, it will return the item content
of the top-k results.
It limits the Document content by doc_content_chars_max.
"""
wikidata_mw: Any #: :meta private:
wikidata_rest: Any # : :meta private:
top_k_results: int = 2
load_all_available_meta: bool = False
doc_content_chars_max: int = 4000
wikidata_props: List[str] = DEFAULT_PROPERTIES
lang: str = DEFAULT_LANG_CODE
@root_validator()
def validate_environment(cls, values: Dict) -> Dict:
"""Validate that the python package exists in environment."""
try:
from mediawikiapi import MediaWikiAPI
from mediawikiapi.config import Config
values["wikidata_mw"] = MediaWikiAPI(
Config(user_agent=WIKIDATA_USER_AGENT, mediawiki_url=WIKIDATA_API_URL)
)
except ImportError:
raise ImportError(
"Could not import mediawikiapi python package. "
"Please install it with `pip install mediawikiapi`."
)
try:
from wikibase_rest_api_client import Client
client = Client(
timeout=60,
base_url=WIKIDATA_REST_API_URL,
headers={"User-Agent": WIKIDATA_USER_AGENT},
follow_redirects=True,
)
values["wikidata_rest"] = client
except ImportError:
raise ImportError(
"Could not import wikibase_rest_api_client python package. "
"Please install it with `pip install wikibase-rest-api-client`."
)
return values
def _item_to_document(self, qid: str) -> Optional[Document]:
from wikibase_rest_api_client.utilities.fluent import FluentWikibaseClient
fluent_client: FluentWikibaseClient = FluentWikibaseClient(
self.wikidata_rest, supported_props=self.wikidata_props, lang=self.lang
)
resp = fluent_client.get_item(qid)
if not resp:
logger.warning(f"Could not find item {qid} in Wikidata")
return None
doc_lines = []
if resp.label:
doc_lines.append(f"Label: {resp.label}")
if resp.description:
doc_lines.append(f"Description: {resp.description}")
if resp.aliases:
doc_lines.append(f"Aliases: {', '.join(resp.aliases)}")
for prop, values in resp.statements.items():
if values:
doc_lines.append(f"{prop.label}: {', '.join(values)}")
return Document(
page_content=("\n".join(doc_lines))[: self.doc_content_chars_max],
meta={"title": qid, "source": f"https://www.wikidata.org/wiki/{qid}"},
)
def load(self, query: str) -> List[Document]:
"""
Run Wikidata search and get the item documents plus the meta information.
"""
clipped_query = query[:WIKIDATA_MAX_QUERY_LENGTH]
items = self.wikidata_mw.search(clipped_query, results=self.top_k_results)
docs = []
for item in items[: self.top_k_results]:
if doc := self._item_to_document(item):
docs.append(doc)
return docs
def run(self, query: str) -> str:
"""Run Wikidata search and get item summaries."""
clipped_query = query[:WIKIDATA_MAX_QUERY_LENGTH]
items = self.wikidata_mw.search(clipped_query, results=self.top_k_results)
docs = []
for item in items[: self.top_k_results]:
if doc := self._item_to_document(item):
docs.append(f"Result {item}:\n{doc.page_content}")
if not docs:
return "No good Wikidata Search Result was found"
return "\n\n".join(docs)[: self.doc_content_chars_max]
Loading…
Cancel
Save